{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-16T08:29:38.337702Z",
     "start_time": "2017-08-16T08:29:38.332519Z"
    }
   },
   "source": [
    "# Building word vectors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:18:49.005860Z",
     "start_time": "2017-11-06T08:18:47.143066Z"
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "import re\n",
    "import collections\n",
    "import itertools\n",
    "import bcolz\n",
    "import pickle\n",
    "sys.path.append('../lib')\n",
    "\n",
    "import gc\n",
    "import random\n",
    "import smart_open\n",
    "import h5py\n",
    "import csv\n",
    "import tensorflow as tf\n",
    "import gensim\n",
    "\n",
    "import datetime as dt\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "random_state_number = 967898"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:18:49.583583Z",
     "start_time": "2017-11-06T08:18:49.007195Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/gpu:0', '/gpu:1']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tensorflow.python.client import device_lib\n",
    "\n",
    "\n",
    "def get_available_gpus():\n",
    "    \"\"\"Return the names of the local devices whose device_type is 'GPU'.\"\"\"\n",
    "    gpu_names = []\n",
    "    for device in device_lib.list_local_devices():\n",
    "        if device.device_type == 'GPU':\n",
    "            gpu_names.append(device.name)\n",
    "    return gpu_names\n",
    "\n",
    "\n",
    "# Create the session with allow_growth enabled before the devices are listed.\n",
    "config = tf.ConfigProto()\n",
    "config.gpu_options.allow_growth = True\n",
    "sess = tf.Session(config=config)\n",
    "get_available_gpus()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:18:49.802482Z",
     "start_time": "2017-11-06T08:18:49.584787Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using matplotlib backend: Qt5Agg\n",
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']\n",
      "`%matplotlib` prevents importing * from pylab and numpy\n",
      "  \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
     ]
    }
   ],
   "source": [
    "# `%pylab` is intentionally not used: its star-import of numpy/pylab overwrites\n",
    "# the stdlib `random` module imported in the first cell. numpy (np) and pyplot\n",
    "# (plt) are already imported explicitly, so only the inline backend is selected.\n",
    "%matplotlib inline\n",
    "%load_ext autoreload\n",
    "# Mode 2 reloads edited modules before every cell execution; a bare `%autoreload`\n",
    "# only triggers a single reload at the moment it is run.\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:18:49.807114Z",
     "start_time": "2017-11-06T08:18:49.804242Z"
    }
   },
   "outputs": [],
   "source": [
    "# Notebook-wide preferences: silence pandas' chained-assignment warning, allow\n",
    "# up to 999 columns when displaying frames, and keep the default seaborn palette.\n",
    "pd.set_option('mode.chained_assignment', None)\n",
    "pd.set_option('display.max_columns', 999)\n",
    "color = sns.color_palette()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load corpus vocab and wordidx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:22:58.465157Z",
     "start_time": "2017-11-06T08:22:58.295554Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "352220 352220\n"
     ]
    }
   ],
   "source": [
    "# Load the stage-1 vocabulary pickle, which holds a 2-tuple.\n",
    "# Presumably (list of words, word -> index mapping) -- TODO confirm against the\n",
    "# notebook that wrote processed/stage1/vocab_words_wordidx.pkl.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.\n",
    "with open('processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:\n",
    "    corpus_vocab_list, corpus_wordidx = pickle.load(f)\n",
    "print(len(corpus_vocab_list), len(corpus_wordidx))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:21:53.419480Z",
     "start_time": "2017-11-06T08:21:34.064401Z"
    }
   },
   "outputs": [],
   "source": [
    "# Open the HDF5 store read-only and close it as soon as both frames are in\n",
    "# memory, so no file handle stays open for the rest of the session.\n",
    "# `store` remains bound after the block but refers to a closed store.\n",
    "with pd.HDFStore('processed/stage1/data_frames.h5', mode='r') as store:\n",
    "    train_df = store['train_df']\n",
    "    test_df = store['test_df']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-trained Word Vectors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## collecting biolab words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:19:00.564777Z",
     "start_time": "2017-09-24T18:18:13.315297Z"
    }
   },
   "outputs": [],
   "source": [
    "from gensim.models.keyedvectors import KeyedVectors\n",
    "\n",
    "# Load the pre-trained vectors stored in binary word2vec format.\n",
    "biolab_keyed_vectors_pubmed_pmc_wiki = KeyedVectors.load_word2vec_format(\n",
    "    'external/biolab_wvs/wikipedia-pubmed-and-PMC-w2v.bin',\n",
    "    binary=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:19:58.165245Z",
     "start_time": "2017-09-24T18:19:57.364009Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5443656"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the keys of the loaded model's vocabulary into a set and report its size.\n",
    "biolab_words_pubmed_pmc_wiki = biolab_keyed_vectors_pubmed_pmc_wiki.vocab.keys()\n",
    "biolab_words = {word for word in biolab_words_pubmed_pmc_wiki}\n",
    "len(biolab_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:20:10.245507Z",
     "start_time": "2017-09-24T18:20:09.630903Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100489\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'inhibitor1',\n",
       " 'expression19',\n",
       " 'espinosa',\n",
       " 'plate-shaped',\n",
       " '5359',\n",
       " 'adrenal',\n",
       " 'auroras',\n",
       " 'de-emphasized',\n",
       " 'bootstrap',\n",
       " 'glyceraldehyde-3',\n",
       " '3.359',\n",
       " '16/121',\n",
       " 'attenuated',\n",
       " '24.29',\n",
       " 'dounced',\n",
       " 'uroplakins',\n",
       " 'splice-donor',\n",
       " 'gplots',\n",
       " '8/117',\n",
       " 'num',\n",
       " 'featuresof',\n",
       " 'snail',\n",
       " 'favored',\n",
       " 'greater-than-additive',\n",
       " 'forty-seven',\n",
       " 'augment',\n",
       " 'grouch',\n",
       " 'de-activating',\n",
       " 'multiplies',\n",
       " 'renaming',\n",
       " 'bx',\n",
       " 'vy',\n",
       " 'eosinophlia',\n",
       " 'tumors.2',\n",
       " 'excretory',\n",
       " 'ofapoptotic',\n",
       " 'typi',\n",
       " 'under-estimated',\n",
       " 'maya',\n",
       " 'extraosseous',\n",
       " 'rho/rac',\n",
       " 'ci',\n",
       " 'galt',\n",
       " 'anti-b-actin',\n",
       " '1-year',\n",
       " '0562',\n",
       " 'hypodiploid',\n",
       " 'regions.5',\n",
       " 'isoform-selective',\n",
       " 'entero-pancreatic',\n",
       " 'cpt-11',\n",
       " 's-protein',\n",
       " 'ns/s',\n",
       " 'caldesmon',\n",
       " 'p50/p50',\n",
       " 'translatable',\n",
       " 'cytoprotection',\n",
       " 'acneform',\n",
       " 'defeating',\n",
       " '10p11.23',\n",
       " 'pharmacotherapy',\n",
       " 'haemorrhagic',\n",
       " '0.262',\n",
       " '0.20-0.78',\n",
       " 'r=0.352',\n",
       " 'mice19',\n",
       " 'instituted',\n",
       " '4.5.4',\n",
       " '0.04',\n",
       " 'refractions',\n",
       " 'whenas',\n",
       " '20.47',\n",
       " 'cells.15',\n",
       " 'villin-like',\n",
       " 'samples5',\n",
       " 'structure-nonspecific',\n",
       " 'two-stranded',\n",
       " 'lipids',\n",
       " 'justin',\n",
       " 'donner',\n",
       " '3,6',\n",
       " 'data36',\n",
       " 'brakets',\n",
       " 'occured',\n",
       " 'originally',\n",
       " 'ida2',\n",
       " 'utra',\n",
       " '176807',\n",
       " '04/01',\n",
       " '0.936',\n",
       " 'killer',\n",
       " '0.1961',\n",
       " '5-dihydrotestosterone',\n",
       " 'anti-collagen',\n",
       " '7072',\n",
       " 'goiters',\n",
       " 'resting',\n",
       " 'resection.3',\n",
       " 'chip-genotyped',\n",
       " '1.0078',\n",
       " '78-bp',\n",
       " '14q11.1-q12',\n",
       " '2-sheet',\n",
       " 'colonectomy',\n",
       " '1660',\n",
       " '4880',\n",
       " 'anti-cytokeratin',\n",
       " 'chlorambucil',\n",
       " 'p-32',\n",
       " 'p=0.090',\n",
       " 'b3a2',\n",
       " 'techne',\n",
       " '1,257',\n",
       " 'phytosome',\n",
       " '2,787',\n",
       " 'closed-loop',\n",
       " 'descending',\n",
       " 'target-of-rapamycin',\n",
       " 'scheid',\n",
       " 'sink',\n",
       " '851',\n",
       " 'interaction29',\n",
       " 'aliquoted',\n",
       " 'purposes',\n",
       " 'tridimensional',\n",
       " '3,346',\n",
       " 'desensitizing',\n",
       " 'aaa',\n",
       " '16-wk',\n",
       " 'irs',\n",
       " 'smiley',\n",
       " 'har',\n",
       " '16,27',\n",
       " 'lgrs',\n",
       " 'alludes',\n",
       " '29-kb',\n",
       " 'i3',\n",
       " '2.42',\n",
       " 'well-polarized',\n",
       " 'variable.11',\n",
       " 'reproducibility',\n",
       " '0.993',\n",
       " 'replicates/treatment',\n",
       " 'calciumphosphate',\n",
       " 'apoprotinin',\n",
       " 'heterozygosity',\n",
       " 'ccl20',\n",
       " 'observing',\n",
       " 'adenovirus-transformed',\n",
       " 'exposition',\n",
       " '4/97',\n",
       " 'tumourigenicity',\n",
       " 'flavonolignans',\n",
       " 'nonmalignant',\n",
       " 'migration-promoting',\n",
       " 'affinity',\n",
       " '35.6',\n",
       " 'toothbrush',\n",
       " 'moter',\n",
       " 'outcome.21',\n",
       " 'mvs',\n",
       " 'volvulus',\n",
       " 'survival36',\n",
       " 'caspase3',\n",
       " 'al.,1997',\n",
       " 'nakai',\n",
       " 'nedd1',\n",
       " 'transcriptome/proteome',\n",
       " 'technique.22',\n",
       " 'softmax',\n",
       " 'magnus',\n",
       " 'growth-controlling',\n",
       " '3667',\n",
       " '29-89',\n",
       " 'naked',\n",
       " 'ideograms',\n",
       " 'structure/folding',\n",
       " 'panelb',\n",
       " 'burns',\n",
       " 'linger',\n",
       " 'decatenation',\n",
       " 'dated',\n",
       " 'vectors',\n",
       " '7.8-fold',\n",
       " '4425',\n",
       " '5.798',\n",
       " '70m',\n",
       " 'cresyl',\n",
       " 'pathways.20',\n",
       " 'subfertile',\n",
       " 'lymphoid/myeloid',\n",
       " 'julie',\n",
       " 'labia',\n",
       " 'plaque',\n",
       " 'apoptogenic',\n",
       " 'hemin',\n",
       " 'experimentales',\n",
       " 'ichthyosis-like',\n",
       " 'rats6',\n",
       " 'therapy.25',\n",
       " 'contraception',\n",
       " 'moderate/strong',\n",
       " 'self-activating',\n",
       " 'trastuzumab-refractory',\n",
       " '0.941',\n",
       " 'magnetic-activated',\n",
       " 'invokes',\n",
       " 'arnt',\n",
       " 'picker',\n",
       " '127/132',\n",
       " 'tracing',\n",
       " 'cells/0.1',\n",
       " 'k11',\n",
       " 'deceases',\n",
       " 'stabilization',\n",
       " 'disfavoured',\n",
       " 'voracious',\n",
       " 'transparency',\n",
       " 'changed',\n",
       " 'larvae',\n",
       " 'spc216',\n",
       " 'levelis',\n",
       " '2ug',\n",
       " 'andcontrol',\n",
       " 'unwilling',\n",
       " 'geldanamycin',\n",
       " '2687',\n",
       " 'p53-deficient',\n",
       " '61.6',\n",
       " '77.85',\n",
       " 'tetramer-monomer',\n",
       " '10-9',\n",
       " '188-192',\n",
       " 'histological',\n",
       " 'recapitulation',\n",
       " 'osi',\n",
       " 'beater',\n",
       " '89.8',\n",
       " 'dermatome',\n",
       " 'worrying',\n",
       " '72-hr',\n",
       " 'multichambered',\n",
       " '5169',\n",
       " 'interleukin-10',\n",
       " '37',\n",
       " 'scrubbing',\n",
       " 'uniprot',\n",
       " 'history.1',\n",
       " '0.00001',\n",
       " '524625',\n",
       " '31/88',\n",
       " 'metalloproteinase-1',\n",
       " '2:2',\n",
       " 'antiphosphotyrosine',\n",
       " 'vitro.32',\n",
       " '0.151',\n",
       " 'plummet',\n",
       " 'hit',\n",
       " 'doughy',\n",
       " 'challenges',\n",
       " 'flag-epitope',\n",
       " 'ala9',\n",
       " 'bmyb',\n",
       " 'garlic',\n",
       " 'atpases',\n",
       " 'function,25',\n",
       " '6787',\n",
       " 'droplet',\n",
       " 'bones',\n",
       " '103.7',\n",
       " 'lnx',\n",
       " '2578',\n",
       " 'ov',\n",
       " 'nicely',\n",
       " 'available',\n",
       " 'ind',\n",
       " '10-fold',\n",
       " 'system,1',\n",
       " 'organism',\n",
       " 'x-y',\n",
       " '14,24',\n",
       " '3a',\n",
       " 'non-stem',\n",
       " 'glyceraldehyde-3-phosphate-dehydrogenase',\n",
       " '15/88',\n",
       " 'blosum',\n",
       " 'impute',\n",
       " 'ileitis',\n",
       " 'serum-activated',\n",
       " 'iswith',\n",
       " 'scorable',\n",
       " 'cyp24a1',\n",
       " 'sstr1',\n",
       " 'repair-mediated',\n",
       " 't311',\n",
       " 'speckled',\n",
       " 'sc-398',\n",
       " 'data22',\n",
       " '9079',\n",
       " '952',\n",
       " 'dacomitinib',\n",
       " 'm-bcr-abl',\n",
       " '9077',\n",
       " 'arginine/serine-rich',\n",
       " 'plink',\n",
       " 'skp-1',\n",
       " '20q13',\n",
       " '9122',\n",
       " 'theres',\n",
       " 'lls',\n",
       " 'trichothiodystrophy',\n",
       " 'uteroglobin',\n",
       " 'outcome.10',\n",
       " 'polymorphism-mediated',\n",
       " 'skii',\n",
       " 'familiar',\n",
       " 'let-7-5p',\n",
       " 'rs1050171',\n",
       " 'pgr',\n",
       " 'mice.84',\n",
       " 'warm',\n",
       " 'hexakisphosphate',\n",
       " 'kif11',\n",
       " '15.43',\n",
       " 'polydactyly',\n",
       " 'hydrolysis',\n",
       " 'seven-stranded',\n",
       " '154534',\n",
       " 'aminoalkylamino',\n",
       " 'pro-proliferative',\n",
       " '4-methoxy-6-nitro',\n",
       " '0.0232',\n",
       " 'insuling',\n",
       " 'distribution',\n",
       " '9/188',\n",
       " 'rs138213197',\n",
       " 'light-density',\n",
       " '0.477',\n",
       " 'length-adjusted',\n",
       " 'campomelia',\n",
       " 'jarvi',\n",
       " 'puromycin-containing',\n",
       " 'ptc2',\n",
       " 'mol/kg',\n",
       " '02.00',\n",
       " 'tumorsuppressive',\n",
       " 'nonchimeric',\n",
       " 'ruan',\n",
       " 'gsr',\n",
       " 'enact',\n",
       " 'vimentin-positive',\n",
       " 'time-courses',\n",
       " '4,266',\n",
       " '0279',\n",
       " '4360',\n",
       " '2.5-kb',\n",
       " 'acc2',\n",
       " 'antiviral',\n",
       " '7.4/1',\n",
       " 'i0.5',\n",
       " 'fluorine',\n",
       " '36-year',\n",
       " 'sierra',\n",
       " 'inapparently',\n",
       " 'flaring',\n",
       " 'poikilocytes',\n",
       " 'reported.27',\n",
       " '0.00388',\n",
       " '15-year-old',\n",
       " 'whan',\n",
       " '16,35',\n",
       " 'transcription-modulating',\n",
       " '0.496',\n",
       " '88.5',\n",
       " 'albanese',\n",
       " 'p161',\n",
       " '3in',\n",
       " 'disease.1',\n",
       " 'hits',\n",
       " '7000',\n",
       " 'dko',\n",
       " '30-amino-acid',\n",
       " 'lissencephaly',\n",
       " 'aqueous',\n",
       " 'squared',\n",
       " 'populations3',\n",
       " 'insect',\n",
       " 'dual-label',\n",
       " 'rac2',\n",
       " 'cells.21',\n",
       " 'co-operates',\n",
       " 'motic',\n",
       " '5574',\n",
       " 'hmga1',\n",
       " 'numbered',\n",
       " 'correspondence',\n",
       " 'eps8',\n",
       " '2,4-dinitrophenyl',\n",
       " '16/46',\n",
       " '0.0083',\n",
       " '25,117',\n",
       " '26,29',\n",
       " 'expression.25',\n",
       " 'regrowth',\n",
       " '19-4',\n",
       " 'avanti',\n",
       " 'pheomelanins',\n",
       " 'one-quarter',\n",
       " 'p=0.82',\n",
       " 'di-methylation',\n",
       " 'calibration',\n",
       " 'strikes',\n",
       " 'gene.38',\n",
       " '12-transmembrane',\n",
       " 'fuji',\n",
       " 'pre-normalization',\n",
       " 'ref.6',\n",
       " 'surprising',\n",
       " '11,19,20',\n",
       " 'spuriously',\n",
       " 'nhs',\n",
       " '6,22',\n",
       " 'immunohistologically',\n",
       " '30.00',\n",
       " 'complementarity',\n",
       " '3.036',\n",
       " '64-117',\n",
       " 'trio',\n",
       " 'twostep',\n",
       " 'expression.47',\n",
       " 'anastomosed',\n",
       " 'plasmid-encoding',\n",
       " '30-120',\n",
       " '2500',\n",
       " '2218',\n",
       " '4/167',\n",
       " 'arterio-venous',\n",
       " 'diphtheria',\n",
       " 'cilia-based',\n",
       " 'gadolinium',\n",
       " '1839',\n",
       " 'rad',\n",
       " 'interactions.24',\n",
       " 'n=146',\n",
       " '1p21',\n",
       " 'anti-keratin',\n",
       " 'landi',\n",
       " 'frost',\n",
       " 'single-dose',\n",
       " '400-bp',\n",
       " 'calnexin',\n",
       " 'sequence-specificity',\n",
       " 'protein.40',\n",
       " 'undissected',\n",
       " 'bayesian',\n",
       " 'gfp-positive',\n",
       " 'ruby',\n",
       " 'ivs38-8t',\n",
       " '2624',\n",
       " 'jobs',\n",
       " 'replication.9',\n",
       " 'poor-prognosis',\n",
       " 'recurs',\n",
       " 'library-selected',\n",
       " 'model-phased',\n",
       " '1658',\n",
       " '28006',\n",
       " 'thoughtfully',\n",
       " 'degradation.25',\n",
       " 'interphasic',\n",
       " 'acid-soluble',\n",
       " 'authenticity',\n",
       " '068',\n",
       " '356',\n",
       " 'expectancy',\n",
       " 'mir-124a',\n",
       " 'bipositional',\n",
       " 'submitters',\n",
       " 'single-mutant',\n",
       " '0.588',\n",
       " '3.3.5',\n",
       " '3757',\n",
       " 'lhb',\n",
       " 'p38-specific',\n",
       " '3341',\n",
       " 'seldomly',\n",
       " 'arrestin',\n",
       " '15.50',\n",
       " 'lpd',\n",
       " 'land',\n",
       " 'micros',\n",
       " 'lowing',\n",
       " 'manometry',\n",
       " 'mandating',\n",
       " '11,470',\n",
       " 'official',\n",
       " 'onartuzumab',\n",
       " '2,579',\n",
       " 'dishes',\n",
       " 'ad5',\n",
       " '74',\n",
       " 'albumin-containing',\n",
       " '6.877',\n",
       " 'lowess',\n",
       " 'donn',\n",
       " 'integrin-signaling',\n",
       " '1x5',\n",
       " 't529',\n",
       " '7d4',\n",
       " '1925-2002',\n",
       " 'enolase',\n",
       " '0.1-cm',\n",
       " 'powerpc',\n",
       " '28/64',\n",
       " '211-230',\n",
       " 'tachibana',\n",
       " 'androgen-binding',\n",
       " 'exchanging',\n",
       " 'q14',\n",
       " 'milliseconds',\n",
       " '14/19',\n",
       " '6.99',\n",
       " 'nuclease-mediated',\n",
       " 'st3',\n",
       " 'through',\n",
       " 'train',\n",
       " '27.33',\n",
       " 'small-bowel',\n",
       " 'processes.1,2',\n",
       " 'gavaged',\n",
       " 'breed',\n",
       " 'chromatography-tandem',\n",
       " 'exclusively',\n",
       " '13,14,17',\n",
       " '7-28',\n",
       " 'reln',\n",
       " 'subclone',\n",
       " 'salicylates',\n",
       " 'time-frame',\n",
       " 'colorimetric',\n",
       " '11/41',\n",
       " 'n=234',\n",
       " 'kpl',\n",
       " 'discriminate',\n",
       " 'high-leukemia',\n",
       " 'fernando',\n",
       " '4/29',\n",
       " 'oncocytes',\n",
       " 'notions',\n",
       " 'quasi-native',\n",
       " 'coopted',\n",
       " 'ministre',\n",
       " 'miscategorized',\n",
       " 'lopes',\n",
       " 'voom',\n",
       " 'sub-confluence',\n",
       " '2hr',\n",
       " 'geniticin',\n",
       " '0.3-0.7',\n",
       " '1.2kg',\n",
       " 'p=0.0124',\n",
       " 'state.3',\n",
       " 'extirpation',\n",
       " 'cell-stress',\n",
       " '14,733',\n",
       " '3min',\n",
       " 'unc',\n",
       " '64.1',\n",
       " 'crescentic',\n",
       " '5149',\n",
       " 'caucasoids',\n",
       " '20ug',\n",
       " '2021',\n",
       " 'sc-520',\n",
       " 'centrifuged',\n",
       " 'ror',\n",
       " 'extreme',\n",
       " 'years,19',\n",
       " 'iii.a',\n",
       " '21.05',\n",
       " 'ischaemic',\n",
       " 'localization/accumulation',\n",
       " '0.3ml',\n",
       " 'fastest-rising',\n",
       " 'lat',\n",
       " 'deglycosylates',\n",
       " 'doesnt',\n",
       " '21-specific',\n",
       " 'www.oncomine.org',\n",
       " 'estrogen-receptor',\n",
       " 'robo4',\n",
       " 'el',\n",
       " 'particularities',\n",
       " 'halo',\n",
       " 'neuropsychiatric',\n",
       " 'overrepresent',\n",
       " 'mediated-apoptosis',\n",
       " 'v6.5',\n",
       " 'oxide',\n",
       " 'exon13',\n",
       " '3-amino-9-ethylcarbazole',\n",
       " 'juxtapositions',\n",
       " 'finger/toe',\n",
       " 'pathways.11',\n",
       " 'nanoseconds',\n",
       " '20-nucleotide',\n",
       " 'xdh',\n",
       " 'spdbv',\n",
       " '3800',\n",
       " 'prox',\n",
       " '2887',\n",
       " '3623',\n",
       " 'barbieri',\n",
       " 'r2=0.63',\n",
       " 'cells/100-mm',\n",
       " 'herpesvirus-2',\n",
       " 'ear-2',\n",
       " '8,41',\n",
       " 'silvestri',\n",
       " 'para',\n",
       " 'probemix',\n",
       " '48,000',\n",
       " 'crenolanib',\n",
       " 'pcl1',\n",
       " 'mmm',\n",
       " 'rationalization',\n",
       " 'stainer',\n",
       " 'potent',\n",
       " 'doughnut',\n",
       " 'pretest',\n",
       " '72-77',\n",
       " 'intraductal',\n",
       " 'fibril-like',\n",
       " 'bcl2l1',\n",
       " 'phenelzine',\n",
       " 'two-plasmid',\n",
       " 'mnp',\n",
       " 'v0',\n",
       " 'ultravision',\n",
       " '84103',\n",
       " 'cod',\n",
       " 'metabolomics',\n",
       " 'late-apoptotic',\n",
       " 'outcome14',\n",
       " '28s/18s',\n",
       " 'fat-poor',\n",
       " 'ey',\n",
       " 'immunologist',\n",
       " '12.36',\n",
       " '0.291',\n",
       " 'reported3',\n",
       " 'osteomas',\n",
       " 'digit',\n",
       " 'laurenti',\n",
       " '162.3',\n",
       " 'ptch2',\n",
       " '12260',\n",
       " 'm6a',\n",
       " '7-3-1',\n",
       " '3-1-3',\n",
       " 'proliferation-promoting',\n",
       " '1965',\n",
       " 'delayed',\n",
       " 'toll',\n",
       " 'organs.3',\n",
       " 'non-severe',\n",
       " 'nal',\n",
       " 'years2',\n",
       " 'hai',\n",
       " 'experimented',\n",
       " 'epidithiodiketopiperazine',\n",
       " '0.66',\n",
       " 'genes32',\n",
       " 'tcf8',\n",
       " '70.5',\n",
       " 'responses',\n",
       " 'cin4',\n",
       " '853',\n",
       " 'v-abl-mediated',\n",
       " 'cut-off',\n",
       " 'structure.24',\n",
       " '0.00019',\n",
       " 'computer-assisted',\n",
       " 'specialised',\n",
       " '3620',\n",
       " 'sphenoid',\n",
       " 'ctr',\n",
       " 'moves',\n",
       " 'elastase',\n",
       " 'rewarded',\n",
       " '5330',\n",
       " 'daughters',\n",
       " 'models.6',\n",
       " 'anatomical',\n",
       " 'wef',\n",
       " 'painted',\n",
       " '20.49',\n",
       " 'sharing',\n",
       " '0.0944',\n",
       " 'enucleation',\n",
       " 'trimethylase',\n",
       " 'incurable',\n",
       " 'isoformspecific',\n",
       " '6:8',\n",
       " 'subdomains',\n",
       " 'potentially',\n",
       " 'peroxisome-biogenesis',\n",
       " 'deb',\n",
       " 'wales',\n",
       " 'lysine-to-glutamine',\n",
       " 'b-casein',\n",
       " '9360',\n",
       " 'centralis',\n",
       " '0.887',\n",
       " 'n=159',\n",
       " 'precedes',\n",
       " '54,000',\n",
       " 'lei',\n",
       " 'perceivably',\n",
       " 'monoubiquitinylation',\n",
       " 'antigen-induced',\n",
       " 'seven-blade',\n",
       " '0.618',\n",
       " 'unveils',\n",
       " 'tartrate',\n",
       " 'accidents',\n",
       " '9-25',\n",
       " 'bureau',\n",
       " 'nongenetic',\n",
       " 'phospho-substrates',\n",
       " 'din',\n",
       " 'activity30',\n",
       " 'lum',\n",
       " 'uence',\n",
       " 'polymorphism-based',\n",
       " 'cross-linked',\n",
       " 'immunodysregulation',\n",
       " 'rephosphorylation',\n",
       " 'particulate',\n",
       " 'inour',\n",
       " 'beta-strands',\n",
       " 'ume',\n",
       " '1959',\n",
       " '30.8',\n",
       " 'not.2',\n",
       " 'hh-related',\n",
       " 'hamon',\n",
       " '848',\n",
       " '1.60',\n",
       " 'hydroxylamine',\n",
       " 'sufficiency',\n",
       " '5394',\n",
       " 'thoracentesis',\n",
       " 'instillations',\n",
       " 'keratohyalin',\n",
       " 'tetrakisphosphate',\n",
       " 'andg',\n",
       " 'ramped',\n",
       " '2l/min',\n",
       " 'v03',\n",
       " 'ureter',\n",
       " '2-positive',\n",
       " 'perception',\n",
       " '1,556',\n",
       " 'aberration',\n",
       " 'admission',\n",
       " 'hu',\n",
       " '4130',\n",
       " 'studies16,17',\n",
       " 'section1',\n",
       " 'convoluted',\n",
       " '0088',\n",
       " 'concentrator-5',\n",
       " '2003a',\n",
       " 'carb',\n",
       " 'post-extension',\n",
       " 'vismione',\n",
       " 'gapless',\n",
       " 'transient-transfection',\n",
       " '1200',\n",
       " 'thyroglobulin',\n",
       " 'alphavbeta3',\n",
       " 'laue',\n",
       " '13,16',\n",
       " 'dyshomeostasis',\n",
       " 'unirradiated',\n",
       " 'redefines',\n",
       " 'non-smad',\n",
       " 'cells/progenitors',\n",
       " 'unliganded',\n",
       " 'patients.12',\n",
       " 'cancer.10',\n",
       " 'acca',\n",
       " 'spib',\n",
       " '1985',\n",
       " 'prdm1',\n",
       " 'permissions',\n",
       " 'study,12',\n",
       " 'whim',\n",
       " 'thymuses',\n",
       " 'ltx',\n",
       " 'mean+s.d',\n",
       " 'phospho-site',\n",
       " 'mid-log',\n",
       " '7q11',\n",
       " 'myelocytic',\n",
       " 'saleem',\n",
       " 'chromosome9',\n",
       " '40mg/kg',\n",
       " 'itim',\n",
       " '65-7',\n",
       " 'silences',\n",
       " 'extra',\n",
       " 'cardial',\n",
       " 'a280',\n",
       " 'plk2',\n",
       " 'b.f',\n",
       " 'subsaturating',\n",
       " '0.02a',\n",
       " 'k-b',\n",
       " 'thr',\n",
       " 'back-focal',\n",
       " 'message',\n",
       " '4459',\n",
       " '95/96',\n",
       " 'transconjugants',\n",
       " 'k71',\n",
       " 'speciesin',\n",
       " '0.286',\n",
       " 'solver',\n",
       " '3780',\n",
       " 'unpermeabilized',\n",
       " '105/well',\n",
       " 'atretic',\n",
       " 'auto-inhibitory',\n",
       " 'side-effects',\n",
       " 'fiberoptic',\n",
       " 'cancer-cell-autonomous',\n",
       " '2.263',\n",
       " 'tumors4',\n",
       " 'unequalled',\n",
       " 'aura',\n",
       " 'no2',\n",
       " '5ms',\n",
       " 'cells10',\n",
       " '0150',\n",
       " '44.8',\n",
       " 'stability',\n",
       " 'dysontogenetic',\n",
       " 'pter',\n",
       " '0.1548',\n",
       " '44,670',\n",
       " 'loop-binding',\n",
       " 'malm',\n",
       " 'collectively',\n",
       " '56.7',\n",
       " 'quinoxaline',\n",
       " '62-fold',\n",
       " 'recurrence',\n",
       " 'displacing',\n",
       " '168-172',\n",
       " 'expression.48',\n",
       " 'rpb1',\n",
       " 'platelet-specific',\n",
       " 'sequence-tagged',\n",
       " 'directs',\n",
       " '23,9',\n",
       " 'unmethylation',\n",
       " 'unlike',\n",
       " 'operator',\n",
       " '14-h',\n",
       " 'indonesian',\n",
       " 'diffi',\n",
       " 'vmd',\n",
       " '18/34',\n",
       " '150mg/day',\n",
       " '0.79',\n",
       " 'base-pairs',\n",
       " '3/229',\n",
       " 'diagnosis.6,7',\n",
       " 'blunter',\n",
       " 'unidentified',\n",
       " '1-592',\n",
       " '510-513',\n",
       " 'www.bioconductor.org',\n",
       " '0117',\n",
       " '736',\n",
       " 'papular',\n",
       " 'cry',\n",
       " 'syntelic',\n",
       " '631',\n",
       " '100/total',\n",
       " 'non-sensitised',\n",
       " 'macrophage-colony-stimulating',\n",
       " 'non-consensus',\n",
       " 'sino-nasal',\n",
       " 'polymerase-1',\n",
       " 'mack',\n",
       " 'sitedirected',\n",
       " 'emblematic',\n",
       " 'progressive/recurrent',\n",
       " 'cycle-sequencing',\n",
       " '62-69',\n",
       " 'heterodimerization-dependent',\n",
       " 'inactivity',\n",
       " 'uml',\n",
       " 'celery',\n",
       " '0.0007',\n",
       " 'separable',\n",
       " '1760',\n",
       " '6897',\n",
       " '4,609',\n",
       " 'apical',\n",
       " 'transformation/transcription',\n",
       " 'mini-exon',\n",
       " 'autoacetylation',\n",
       " 'responsivity',\n",
       " 'super-infected',\n",
       " 'norte',\n",
       " 'n=507',\n",
       " '0.4325',\n",
       " 'economically',\n",
       " 'non-staining',\n",
       " 'translocations',\n",
       " 'attractions',\n",
       " 'pompa',\n",
       " 'polyacrylamide-urea',\n",
       " '55-bp',\n",
       " '5-terminus',\n",
       " 'tglu',\n",
       " 'caspase-8-specific',\n",
       " '10595',\n",
       " 'ile',\n",
       " 'mammary',\n",
       " 're-suspended',\n",
       " 'protein.6',\n",
       " '89.94',\n",
       " '95-3',\n",
       " '58.3',\n",
       " '1.3e',\n",
       " 'pre-tx',\n",
       " 'gamma-treated',\n",
       " '46.9',\n",
       " 'treatment.13',\n",
       " 'accumu-lation',\n",
       " '1d',\n",
       " 'al.,2008',\n",
       " 'messengers',\n",
       " 'protein-treated',\n",
       " 'primetime',\n",
       " '2/31',\n",
       " '2491',\n",
       " '13039',\n",
       " 'mismapping',\n",
       " 'control.6',\n",
       " 'sub-chromosomal',\n",
       " 'tumors7',\n",
       " '98.02',\n",
       " '1,1-biphenyl',\n",
       " 'preserved',\n",
       " 'development1',\n",
       " 'phosphorylation-mediated',\n",
       " '100,101',\n",
       " 'vala',\n",
       " 'ctype',\n",
       " 'rs2230782',\n",
       " 'argi',\n",
       " '3-case',\n",
       " 'mills',\n",
       " 'scores14',\n",
       " '1111',\n",
       " '8079',\n",
       " 'ubiquitylating',\n",
       " 'hemming',\n",
       " 'sptan1',\n",
       " '0.47',\n",
       " 'three-generational',\n",
       " 'bothersome',\n",
       " 'p=0.64',\n",
       " 'pla2',\n",
       " 'matter',\n",
       " 'immune-surveillance',\n",
       " '0.037',\n",
       " '355',\n",
       " '9,197',\n",
       " 'phosphatase-anti-alkaline',\n",
       " '3a2',\n",
       " 'class-switch',\n",
       " 'n=127',\n",
       " 'sub-ependymal',\n",
       " 'chemotherapy-refractory',\n",
       " 'telecanthus',\n",
       " 'rps29',\n",
       " 'n42',\n",
       " 'immunoassayed',\n",
       " 'mir-194',\n",
       " 'methoda',\n",
       " 'nutator',\n",
       " 'methylation-free',\n",
       " 'fluorescence-activated',\n",
       " ...}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Corpus words that also appear in the pre-trained biolab vocabulary.\n",
    "# Uses `corpus_vocab_list` (loaded from the stage-1 pickle above) so the cell\n",
    "# runs on a fresh kernel; `biolab_words` is already a set.\n",
    "vocab_biolab = biolab_words & set(corpus_vocab_list)\n",
    "print(len(vocab_biolab))\n",
    "# Show a small, deterministic preview instead of dumping the whole set.\n",
    "sorted(vocab_biolab)[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:20:26.479694Z",
     "start_time": "2017-09-24T18:20:25.862613Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "251731\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'',\n",
       " 'mscv-nup214-abl1-ires-gfp',\n",
       " 'limma.18',\n",
       " 'c3h/1oth/2',\n",
       " '0.41.0',\n",
       " 'stablee',\n",
       " 'gdc-0879mediated',\n",
       " 'a62t',\n",
       " 'tumorsnamely',\n",
       " 'her2-so/cep17-sg',\n",
       " 'buffersubcloned',\n",
       " 'measurementss.d',\n",
       " 'cbreast',\n",
       " 'rppametastasis-associated',\n",
       " 'phenylalanine18',\n",
       " 'detectableboard',\n",
       " 'c.17981799gt',\n",
       " 'ofdoes',\n",
       " 'olfm4',\n",
       " 'fkbp12rapamycin',\n",
       " 'phenotype.lambdoid',\n",
       " 'fragment17',\n",
       " 'slc34a2',\n",
       " '20gap',\n",
       " 'suppressorwell',\n",
       " 'saciisite',\n",
       " 'reportednrnac.1a',\n",
       " 'd177y',\n",
       " 'observeddownloadin',\n",
       " 'p454s',\n",
       " 'only.7.four',\n",
       " 'ganglioglioma.25',\n",
       " 'glissons',\n",
       " 'puastderkwt',\n",
       " 'lamp2-positive',\n",
       " 'a/gagarose',\n",
       " 'dahln',\n",
       " 'obstructiona',\n",
       " 'etv1suggest',\n",
       " 'e32g',\n",
       " 'catga',\n",
       " 'identity22',\n",
       " 'mm00455685_m1',\n",
       " 'cys72asp74',\n",
       " 'itdalleles',\n",
       " 'coimmunoprecipitatedwith',\n",
       " 'andpepstatin',\n",
       " 'micej',\n",
       " '4016m1r219',\n",
       " 'treatment-relatedareas',\n",
       " 'smap.23',\n",
       " 'r70w',\n",
       " 'pt3n1',\n",
       " 'genessupplementaryincluding',\n",
       " '0.064-2.262',\n",
       " '2a763v',\n",
       " 'observations29',\n",
       " 'p14arf/dapk/p53',\n",
       " 'a314vexhibited',\n",
       " '321a',\n",
       " 'severalfragment',\n",
       " 'cellreaction',\n",
       " 'referred.haplotype',\n",
       " 'kemler',\n",
       " 's114x.11',\n",
       " 'dpp-8400574',\n",
       " '5-gttgaacggtggccacaccggc-3',\n",
       " '1501g-a',\n",
       " 'lymphoid-signaling',\n",
       " '2002tf',\n",
       " 'c1275y',\n",
       " 'tyr646',\n",
       " 'ar-v1',\n",
       " 'p-eif2',\n",
       " 'p53fl',\n",
       " '30345',\n",
       " 't241m',\n",
       " 'l288sfish',\n",
       " 'peripherydownloadin',\n",
       " 't/ti',\n",
       " 'l273m',\n",
       " 'carriers22',\n",
       " 'materialfig',\n",
       " '02097810',\n",
       " 'andets',\n",
       " 'visvanathan',\n",
       " '29leiomyosarcoma+0.750.31.171.831.44',\n",
       " 'substrates/oncogenic',\n",
       " 'pre-b/t-all',\n",
       " 'p.a148t',\n",
       " 'i11t',\n",
       " 'all.rapamycin',\n",
       " 'tumour-extracted',\n",
       " 'autogizer',\n",
       " 'supplemental3h',\n",
       " '53/411',\n",
       " 'azevedo',\n",
       " 'h50r',\n",
       " 'mutation.among',\n",
       " 'insectin',\n",
       " 'apc-coupled',\n",
       " 'muci/gram',\n",
       " 'x87838',\n",
       " 'homotrimers.options',\n",
       " 'nsii',\n",
       " 'teetthvvmktdaefvcertlkyflgiaggkwvvsyfwvtqsikerkmlnehdfevrgdv',\n",
       " 'min.the',\n",
       " 'rs743185',\n",
       " 'fiftyfive',\n",
       " 'c.625c',\n",
       " 'g360a',\n",
       " 'needof',\n",
       " 'c2h2-type',\n",
       " 'r369w',\n",
       " 'asxl1msh6unfortunately',\n",
       " '211k24',\n",
       " '10f4252pancolonic1.8+apyes',\n",
       " 'msi+24poly',\n",
       " 'med12/cyclin',\n",
       " 'localtest',\n",
       " 'rate.largedownload',\n",
       " 'co-defining',\n",
       " 'johinaz.cgen',\n",
       " 'l115p',\n",
       " '65:5563',\n",
       " '4afigure',\n",
       " 'rv560-561',\n",
       " 'pit-2',\n",
       " 'pd03259019',\n",
       " 'palb2-associated',\n",
       " 'akt3z',\n",
       " 'proteinandgrowth',\n",
       " '9one',\n",
       " 'kolodnerterminal',\n",
       " 'r905',\n",
       " 'butby',\n",
       " 'denysdrash',\n",
       " 'proline-287',\n",
       " 'solexa-illumina',\n",
       " 'fgfr3.32,33',\n",
       " 'irish/german',\n",
       " 'meanslevels',\n",
       " 'lxxll-like',\n",
       " 'pe1e2s1',\n",
       " 'pone.0064364.e015.jpg1.0',\n",
       " \"5'-agttccactcttagaggtag-3\",\n",
       " 'fujigaoka',\n",
       " 'lex1',\n",
       " 'v261m',\n",
       " 'y17126',\n",
       " 'inectmwt-1',\n",
       " 'bcdk6',\n",
       " 'gilhuis',\n",
       " 'd1203',\n",
       " 'supplementarymutantsassessed',\n",
       " 'nb1224',\n",
       " 'element/binding',\n",
       " 'transduction.5,6',\n",
       " 'cr-oe33',\n",
       " 'h1417d',\n",
       " 'cysts/multinodular',\n",
       " '2c/min',\n",
       " '5-aatagattctggcattgtggtccccgttttcttatggg-3',\n",
       " 'h-actin',\n",
       " 'gimema',\n",
       " 'd18n',\n",
       " 'd1s2737',\n",
       " '15,000r112c',\n",
       " 'resper',\n",
       " 'h701',\n",
       " 's34f/y',\n",
       " 'c.6014c',\n",
       " 'e295k',\n",
       " 'h2aub1',\n",
       " 'response32.3',\n",
       " 'pik3cathat',\n",
       " 'tg101209.our',\n",
       " 'v379a',\n",
       " 'cd4-cell',\n",
       " 'g4450a',\n",
       " 'bamhi/aflii',\n",
       " 'fgf-mapk-pathway',\n",
       " 'o61267',\n",
       " 'kumamoto',\n",
       " '12.9other',\n",
       " 'introns43',\n",
       " 'l2575',\n",
       " 'y126',\n",
       " 'c.2221-126c',\n",
       " 'gtctctcccttgaaatgctgtga',\n",
       " 'mplw515l-expressing',\n",
       " 'ezh2h689a,19',\n",
       " 'animals13',\n",
       " 'elf-3',\n",
       " 'v218g',\n",
       " '24772/pnf',\n",
       " 'fhgenomic',\n",
       " 'tmprss2erg3',\n",
       " 'xyftqtllpglag',\n",
       " 'tac272tat',\n",
       " 'workresistance',\n",
       " 'nbd-gdp/gtp',\n",
       " '9121s',\n",
       " 'venkataramani',\n",
       " 'bp21',\n",
       " 'f451l',\n",
       " 'epithelial183',\n",
       " 'nut-positive',\n",
       " '212-717-3203',\n",
       " 'q0q1dt18',\n",
       " 'q661',\n",
       " 'gcagctgcccggggccgaca',\n",
       " '0.00010.13',\n",
       " 's1898f',\n",
       " 'dexseq.29',\n",
       " '19.6os',\n",
       " 'mda231',\n",
       " 'respectively.sequencing',\n",
       " 'nonfunctioningmaximizing',\n",
       " 'routineuse',\n",
       " 'notch1-responsive',\n",
       " 'ascom-mll3',\n",
       " 'therapeutics130',\n",
       " 'cellcompound-protein',\n",
       " 'univ-lvon1.fr',\n",
       " 'processesfor',\n",
       " 'centrosomes7a',\n",
       " 'amino-binding',\n",
       " 'cells125',\n",
       " 'amx500',\n",
       " 'sdsr703p',\n",
       " 'adenocpoor50m40japan',\n",
       " 'panasonic',\n",
       " 'i220',\n",
       " \"5'-gagtgctctaatgactgactgaga-3'/5'-aaaggtgacatggaaagccc-3\",\n",
       " 'foldin',\n",
       " '0/134',\n",
       " 'tissuessupplementaryof',\n",
       " 'c.1702_1703del',\n",
       " 'ea64',\n",
       " 'v37m',\n",
       " 'normalsupplementalmargaret',\n",
       " 'lymphocytosissupplementarynormal',\n",
       " 'eachrepresents',\n",
       " 'l1546n',\n",
       " 'f.d.n',\n",
       " 'a259raf-13b',\n",
       " 'c141y',\n",
       " 'eitherslcros',\n",
       " 'esr1-e380q',\n",
       " 'www-huber.embl.de/users/anders/htseq',\n",
       " 'tyr1278',\n",
       " 'viadetected',\n",
       " 'ubbe',\n",
       " 'coitus.56runx1',\n",
       " 'domain/ras',\n",
       " 'pdgf-a/c',\n",
       " 'phospho-erbb2',\n",
       " 'supplementarysmarca4',\n",
       " 'ethyleneglycoltetracetic',\n",
       " 'wascomparable',\n",
       " 'hsc/clp',\n",
       " 'tim-craf',\n",
       " '24hec',\n",
       " 'thegene',\n",
       " 'fdr=0.01',\n",
       " 'datafig',\n",
       " 'nowak-wegrzyn',\n",
       " '1321g',\n",
       " 'brc15',\n",
       " 'mef2btranscript',\n",
       " 'panitumumabf',\n",
       " 'aml.15,23,27',\n",
       " '0019059',\n",
       " 'for13',\n",
       " '1779insc',\n",
       " 'r257g',\n",
       " 'p.leu597val',\n",
       " 'l766pproteins',\n",
       " 'hgf-mediated',\n",
       " 'uhplc-qqtof-ms',\n",
       " 'msh6.67',\n",
       " 'frame.29',\n",
       " 'backx',\n",
       " 'glu118',\n",
       " 'pcaf-mediated',\n",
       " '596718',\n",
       " 'www.arup.utah.edu',\n",
       " 'investigations,36',\n",
       " 'ands21',\n",
       " 'inicd1.reverse',\n",
       " 'paired-primers',\n",
       " 'h1155',\n",
       " 'showedremarkable',\n",
       " '5-cagtttctgtctgctaggag-3',\n",
       " 'hal-b2',\n",
       " 'assaysleft',\n",
       " 's966q',\n",
       " 'al-elein',\n",
       " 'genesunfortunately',\n",
       " 'nickel-1,2-dioleolyl-sn-glycero-3',\n",
       " 'mandatory.26',\n",
       " 'wereaccording',\n",
       " 'gli1-driven',\n",
       " 'tyrosinethus',\n",
       " 'bc-his6lane',\n",
       " 'garcia-olive',\n",
       " 'p27t198v',\n",
       " 'thr75met75',\n",
       " 'intoprb16',\n",
       " 'th2-helper',\n",
       " 'iswi-related',\n",
       " 'sos1e846k',\n",
       " 'spoplalso',\n",
       " 'dicer16',\n",
       " 'lung29,30',\n",
       " 'serleu',\n",
       " '1-trcn0000074283',\n",
       " 'proteinsone',\n",
       " 'psq-containing',\n",
       " 'anti-erk-2',\n",
       " 'suppression.it',\n",
       " 'gst-erk2',\n",
       " 'nacl',\n",
       " 'doublet=en',\n",
       " 'samplewould',\n",
       " '503-220-3405',\n",
       " 'erbb4.23',\n",
       " 'tyr279cys',\n",
       " 'studied1',\n",
       " 'erk1k71r/erk2k54r',\n",
       " 'irizarry',\n",
       " 'fiscella',\n",
       " 'infiltratingresistance',\n",
       " 't49.8',\n",
       " 'acidsubstitutions',\n",
       " '600.options',\n",
       " 'hadeggs',\n",
       " 'whenof',\n",
       " '11resulting',\n",
       " 'cellsmander',\n",
       " 'methylation4,12',\n",
       " 'caggtcttgatgtacttccctcgtttgtgcagc',\n",
       " 'leu-536',\n",
       " 'ppm1d-mutant',\n",
       " 't232',\n",
       " 'v101m',\n",
       " '1592delt',\n",
       " 'atpprotein',\n",
       " 'examineddiagnosed',\n",
       " 'resistance3b',\n",
       " \"5'-gggaccggcttaatccatag-3\",\n",
       " 'biotecnologies',\n",
       " 'rxrb',\n",
       " 'assays.72',\n",
       " 'slidenorthern',\n",
       " 'sdfl',\n",
       " 'pixsys',\n",
       " 'gcn5/pcaf',\n",
       " 'e6.5-e9.5',\n",
       " 'fkrkhkkdisqnkravrr',\n",
       " 'mir-34c-5p',\n",
       " 'mores3',\n",
       " 'd024',\n",
       " 'approximation.20,21',\n",
       " 'nuclearextracts',\n",
       " 'cases.7,10',\n",
       " 'mmmt',\n",
       " 'poulikakos',\n",
       " 'tecknica',\n",
       " '140481275-140481298',\n",
       " 'anti-ar3',\n",
       " 'p70s6k-t389',\n",
       " 'ccagagtgctctaatgactg',\n",
       " 'c.ten',\n",
       " 'york-presbyterian/columbia',\n",
       " 'alone5',\n",
       " 'dnindividuals',\n",
       " '3376554583moderate',\n",
       " 'gfp-ezh2y641',\n",
       " 'kmeindl',\n",
       " '657185',\n",
       " 'anti-phospho-c-kit',\n",
       " 'knies-bamforth',\n",
       " 'aneuploidy.micrographs',\n",
       " 'cancers4547',\n",
       " 'sox11-negative',\n",
       " 'hsc7e116',\n",
       " 'inhibitor7figure',\n",
       " '2003-00328',\n",
       " 'gfoldwt',\n",
       " '10wtwtwtwtwtwtwtwtwtwt1',\n",
       " 'translational/targeted',\n",
       " '3q4u',\n",
       " 'betap2loop',\n",
       " 'geneperhaps',\n",
       " 'coot.embl.de',\n",
       " 'homologywith',\n",
       " 'cyclearrested',\n",
       " 'skin.58',\n",
       " 'delinsl',\n",
       " 'q1537rq1537r',\n",
       " 'peprotec',\n",
       " '302131',\n",
       " 'thusthere',\n",
       " 'micecarrying',\n",
       " 'speciwc',\n",
       " '430k10',\n",
       " 'f57l',\n",
       " 'b-rafq257rsupplementarys2',\n",
       " 'pdgfrp',\n",
       " 'prmd1',\n",
       " 'targets13',\n",
       " 'reflections14,83521,181',\n",
       " 'myc-reconstituted',\n",
       " 'polymorphism.in',\n",
       " 'ibgc4.7,13,14',\n",
       " 'bar=100m',\n",
       " 'sspi',\n",
       " 'clegg',\n",
       " 'inknockdown',\n",
       " 'inactivation,16',\n",
       " 'set1a/b',\n",
       " '4division',\n",
       " 'v1m',\n",
       " 'datayes',\n",
       " 'tctgcagcagcaggcaga',\n",
       " 'l276praf',\n",
       " 'mutras',\n",
       " 'lats2-expressing',\n",
       " 'studyf',\n",
       " 'dccd',\n",
       " 'flj126847.7col8a1acollagen',\n",
       " 'pvhl-defective',\n",
       " 'lys569',\n",
       " 'm243-f1695',\n",
       " 'that2',\n",
       " 'homology2',\n",
       " '78260810',\n",
       " 'sh_1',\n",
       " 'zc3h12b',\n",
       " 'songet',\n",
       " 'g1157s',\n",
       " 'medium10',\n",
       " 'k433r',\n",
       " '28+ndnd',\n",
       " '4087197',\n",
       " 'missense427.5nonsense11820.9frameshift',\n",
       " '1207/1605',\n",
       " 'cellsac',\n",
       " '5cctcctaccttggcattaca3',\n",
       " 'dosagef',\n",
       " 'debrauwere',\n",
       " 'betweendifferent',\n",
       " '20092013',\n",
       " '9p272s111310',\n",
       " 'wild-typeqi',\n",
       " 'pfdn6',\n",
       " '121anormalgacaspgacatcnegneg',\n",
       " 'receptorlung',\n",
       " 'ic50sdeletions',\n",
       " '986995',\n",
       " 'appbp1-uba3',\n",
       " 'r482c',\n",
       " 'g272d',\n",
       " 'g779s',\n",
       " 'ezrint567dwith',\n",
       " 'familyin',\n",
       " 'heterodimer.a',\n",
       " 'log-calculation',\n",
       " '18.2g',\n",
       " 'c27a',\n",
       " 'pastorfide',\n",
       " 'vaco5',\n",
       " 'shnf1e',\n",
       " 'checkpointlines',\n",
       " 'nhr14',\n",
       " '4515mbtnamissenseex8c.856c',\n",
       " 'sda-containing',\n",
       " 'melanoma38.4',\n",
       " 'hla-c*04:09n',\n",
       " 'reverse5-tcagtccataagccaagctctca-3',\n",
       " 'c.1501g4a',\n",
       " 'tnfsf11/rankl',\n",
       " 'a226tfigure',\n",
       " 'e640',\n",
       " '89991',\n",
       " '13871784',\n",
       " 'gelsi',\n",
       " 'rasbraf',\n",
       " 'a218v',\n",
       " 'g1202r.15alk',\n",
       " 'ac-ii',\n",
       " 'patientp22',\n",
       " 'sirnas2b',\n",
       " 'hafner',\n",
       " 'l858rand',\n",
       " 'c.s.hill',\n",
       " 'r196l',\n",
       " 'leukemias.40',\n",
       " 'leukemogeneic',\n",
       " 'separateof',\n",
       " 'mutations24',\n",
       " 'whalin',\n",
       " 'trkai/ii',\n",
       " 'e478k',\n",
       " 'rb-/-/p107',\n",
       " 'slidewe',\n",
       " 'samplescommon',\n",
       " 'phospho-kit',\n",
       " 'ckitwild-type',\n",
       " '3.1-pdgfr',\n",
       " 'cytoplasmic.the',\n",
       " 'asn117ser',\n",
       " 'nonpolyposispackage',\n",
       " 'shecases',\n",
       " 'lincscloud',\n",
       " 'daystop',\n",
       " 'p1087r',\n",
       " 'tcccctgttgattccctaga',\n",
       " '11f41',\n",
       " 'pittsburgh.sequence',\n",
       " 'gfr/pkc',\n",
       " 'c797s',\n",
       " 'torc1',\n",
       " 'ct60',\n",
       " 'genesdna',\n",
       " 't155i',\n",
       " 'informationapart',\n",
       " 'ttaggatgagcctctcctagactt',\n",
       " 'previously.4,17,22',\n",
       " 'indmem',\n",
       " '92.5101.5',\n",
       " 'she78-7',\n",
       " 'tia1',\n",
       " 'make.cells',\n",
       " '23995711',\n",
       " 'ds-55004',\n",
       " 'l430p',\n",
       " '426-521',\n",
       " 'method74',\n",
       " 'angio100',\n",
       " 'non-bat-rii',\n",
       " '368-5698',\n",
       " 'armigate',\n",
       " 'hpvprominently',\n",
       " '3.50+0.20',\n",
       " 'atcontent',\n",
       " 'fc2s',\n",
       " 'agentscolony',\n",
       " 'supp.s1',\n",
       " 'genomicer',\n",
       " 'cmv-vp16-tfap2a',\n",
       " '241-269',\n",
       " 'lys105',\n",
       " 'caagtattggtctctcgtctttcagctggataaggtctggtttaatgc',\n",
       " '0.27.6',\n",
       " 'physicallyeither',\n",
       " 'flag-traf6',\n",
       " 'nct00312377',\n",
       " 'ahcyl1',\n",
       " 'fgfr1,17',\n",
       " 'www.broadinstitute.org/cancer/software/genepattern',\n",
       " 'populations180',\n",
       " 'srp033306',\n",
       " 'n1380',\n",
       " 'wererequired',\n",
       " 'bindingcrystal',\n",
       " 'p087',\n",
       " '256kb',\n",
       " 'accagcca-ccactttctgatagg',\n",
       " 'catcccatggtggc*gggatggttgcagaag',\n",
       " 'issubset',\n",
       " 'certam',\n",
       " 'd446v',\n",
       " 'syndromecausative',\n",
       " 'y339',\n",
       " 'pone.0064364.e006.jpg',\n",
       " 'functioning.the',\n",
       " 'ligandlane',\n",
       " 'tyr791phe',\n",
       " 'c896',\n",
       " 'fgf8=1.79769e+308',\n",
       " 'ikk3',\n",
       " 'f99s',\n",
       " 'tasimilar',\n",
       " '2e758ga1l798f/ha4a864q',\n",
       " 'nucleotide.asxl1',\n",
       " 'y537s3045950.0110.0001',\n",
       " 'nes=2.05',\n",
       " '17q22-2514daint-213q32-33',\n",
       " 'fluor-555',\n",
       " 'flankswere',\n",
       " 'tgs-6',\n",
       " 'p73mutant',\n",
       " 'yasuji',\n",
       " 'p.asn127del',\n",
       " '2122-nt',\n",
       " 'cimp16',\n",
       " 'tamra-ins',\n",
       " 'fibroblasts5',\n",
       " 'serum5figure',\n",
       " 'studiesrnai',\n",
       " 'chromosome17q11.2',\n",
       " 'proliferationa',\n",
       " 'pci-neo-baf250',\n",
       " 't-e-y',\n",
       " 'd609g',\n",
       " 'lagerstedt',\n",
       " 'aptag-1',\n",
       " 'gcmn',\n",
       " 'tttggaagctctcagggtac',\n",
       " 'syndromes15,21',\n",
       " 'sophie',\n",
       " 'ciovacco',\n",
       " 'immunoblottingand',\n",
       " 'injeclion',\n",
       " 'c.2149g',\n",
       " 'pa2g4p4',\n",
       " 'dach1',\n",
       " 'sud-luxembourg',\n",
       " 'd85n',\n",
       " 'e600w',\n",
       " 'krasthe',\n",
       " '1mq4c',\n",
       " 'h193qp53',\n",
       " 'peg3',\n",
       " 'a40v',\n",
       " 'theerror-containing',\n",
       " 'g112e',\n",
       " 'messiaen',\n",
       " '586del',\n",
       " 'repeats5,6',\n",
       " 'mekwith',\n",
       " 'r748g',\n",
       " 'mpl.34-37',\n",
       " \"c'l\",\n",
       " 'nopho',\n",
       " 'bosmuller',\n",
       " 'devol',\n",
       " 'rs66944506',\n",
       " '4494503',\n",
       " 'arer207w',\n",
       " 'cellsdataset',\n",
       " 'fhl1-induced',\n",
       " 'wolf-hirschhorn',\n",
       " 'diderot',\n",
       " 'msh2-vd862msh6p',\n",
       " 'p3xflag-cmv-wild-type-chk2',\n",
       " '3.24+0.18',\n",
       " 'small.1',\n",
       " 'c.856g',\n",
       " 'masp2',\n",
       " 'shinmura',\n",
       " 'improm-ii',\n",
       " 'hs00368175_m1',\n",
       " '2supplementary1',\n",
       " 'ewsr1-2',\n",
       " 'd86n',\n",
       " 'errfi14a',\n",
       " 'egflane',\n",
       " 'f224lcontain',\n",
       " 'y859',\n",
       " 'smai/bglii',\n",
       " \"5'-atcatgtttgagaccttcaa-3\",\n",
       " 'e1a-binding',\n",
       " 'this25',\n",
       " '106the',\n",
       " 't/p.p214l',\n",
       " '0.20.3',\n",
       " 'supplementary11a',\n",
       " 'randerath',\n",
       " 'antigal4',\n",
       " 'chek1',\n",
       " 'differenceq276p',\n",
       " 'schwaller',\n",
       " 'g20a',\n",
       " 'htlv-iinfected',\n",
       " 'together.13,14,36',\n",
       " 'kms-9',\n",
       " 'pegfp-flag-pdz',\n",
       " 'antibody15,34,39',\n",
       " 't0.31',\n",
       " 'amc.uva.nl',\n",
       " 'c304',\n",
       " 'proteinsupplementaryno',\n",
       " '5001,600',\n",
       " 'arid1b-associated',\n",
       " 'receptor-smads',\n",
       " '5-ctggaagcaaagacggacaa-3',\n",
       " 'pznctj2-q205l',\n",
       " 'h1881',\n",
       " '7235g',\n",
       " 'siles',\n",
       " 'balkwill',\n",
       " 'mycmaxmxd',\n",
       " 'p2lv-h-rasand',\n",
       " 'theevent',\n",
       " 'il-1-il-1r',\n",
       " 'etoh/hanks',\n",
       " 'f7425',\n",
       " 'inhibitionsupplement',\n",
       " '5nmis',\n",
       " '10.1007/s10147-013-0602-1',\n",
       " 'et163950',\n",
       " 'future-the',\n",
       " 'erlotinib.81',\n",
       " 'non-transactivating',\n",
       " 'flt3mf5',\n",
       " 'comparedunfortunately',\n",
       " 'nucleotidesthe',\n",
       " 'p.q684x',\n",
       " '31544',\n",
       " 'resultsliposarcomas',\n",
       " 'adenocarcinomas.28,44,47',\n",
       " 'mutant-egfr',\n",
       " 'pgex-ecorv-sac1',\n",
       " 'specificallyt790m',\n",
       " 'lmpl',\n",
       " 'cisa',\n",
       " 'yeast,3',\n",
       " 'k708',\n",
       " 'competitorin',\n",
       " '0.500.83',\n",
       " 'donnem',\n",
       " 'l858r-substituted',\n",
       " 'paip1s-luc',\n",
       " 'c.425',\n",
       " 'bclanes',\n",
       " 'immunoblottingfor',\n",
       " 'm-v5',\n",
       " 'f37',\n",
       " 't4m0',\n",
       " 'smad4-reconstituted',\n",
       " 'ct-a/cyt-1',\n",
       " 'pladienolide,64',\n",
       " 'fgfr2-cit',\n",
       " 'stag2l360w',\n",
       " 'polymicrogyria-postaxial',\n",
       " 'methodsr300h',\n",
       " 'glfg-2',\n",
       " 'flag-elf3n233',\n",
       " 'anti-gata3',\n",
       " 'pml/rar-expressing',\n",
       " 'n=48supplementary',\n",
       " 'catagg',\n",
       " 'ssc/0.3',\n",
       " 'variation_29880',\n",
       " 'ew-sm1-989',\n",
       " 'restrictionpattern',\n",
       " 'function5677insatruncated',\n",
       " 'dmempten',\n",
       " 'condensedf',\n",
       " '2683-2711',\n",
       " 'pe-fluorescence',\n",
       " 'pathogenica',\n",
       " '3.285614467',\n",
       " 'w406r',\n",
       " 'ampmdm2',\n",
       " 'srf-staining',\n",
       " 'zfn639',\n",
       " 'y6a',\n",
       " 'c-16',\n",
       " 'c3h/10t/2',\n",
       " 'y-meso-27',\n",
       " 'fc=1.46',\n",
       " 'g482v',\n",
       " 'transformationrelevant',\n",
       " 'p332q',\n",
       " 'lys3326tersdhb',\n",
       " '4337123',\n",
       " 'p53-/-/p210',\n",
       " 'bjornsti',\n",
       " 'allele34,35',\n",
       " 'theiii',\n",
       " 'igf-i-induced',\n",
       " '2.1289126',\n",
       " 'e40t',\n",
       " 'bc012846.1',\n",
       " 'cellssupplementalimportantly',\n",
       " 'vigers',\n",
       " 'anassays',\n",
       " 'brim-311',\n",
       " 'rp11-295i5',\n",
       " 'ttce2',\n",
       " 'idh2,15',\n",
       " 'detectedlanes',\n",
       " 'bcr-ablt315itransformed',\n",
       " 'il-9r/c',\n",
       " 'esr-y537n',\n",
       " 'melanomasl',\n",
       " 'g418-positive',\n",
       " 'primersthat',\n",
       " 'wee1hu',\n",
       " 'hrp-coupled',\n",
       " 'wild-type-likemutant',\n",
       " '4/490',\n",
       " 'leu298',\n",
       " 'collagen-5a1',\n",
       " 'p.q9x',\n",
       " 'hetzer',\n",
       " 'gradeprognostic',\n",
       " 'wm278-gfp',\n",
       " 'spen-specific',\n",
       " 'contexts.conclusionin',\n",
       " 'e02641959224774121743269288',\n",
       " 'tmm18',\n",
       " 'g662eschematic',\n",
       " 'h585d',\n",
       " 'gagtcatcaattttattctgactgatcc',\n",
       " 'myc5a',\n",
       " '37yrs',\n",
       " 'pro261-induced',\n",
       " 'ng/lin',\n",
       " 'post-enucleated',\n",
       " 's1py-bound',\n",
       " 'v281i',\n",
       " 'disease0',\n",
       " 'bartletts',\n",
       " 'cellssupplementaryoverall',\n",
       " 'modifications.9,10',\n",
       " '5-caaga-acagcaacgagtaccg-3',\n",
       " 'ostium-primum',\n",
       " 'female3823',\n",
       " 'pms2-interactive',\n",
       " 'aacggtaccaaggctgagaa',\n",
       " 'texasred-conjugated',\n",
       " 'lskeffect',\n",
       " 'mfinal',\n",
       " 'spop-mediated',\n",
       " 'h-galactosidase',\n",
       " '70.169.2',\n",
       " 'esmedical',\n",
       " 'budssuch',\n",
       " 'reorganizedin',\n",
       " 'r246w',\n",
       " '32p-4e-bp1',\n",
       " \"5'-aatgcccat\",\n",
       " 'anti-phospho-c-met',\n",
       " '68/f',\n",
       " 'trastuzumab.25',\n",
       " 'snu-886',\n",
       " 'cplcknditkrslqestrfsqlveellkii',\n",
       " 'htertnot',\n",
       " '1.282.17',\n",
       " 'satoko',\n",
       " '5-aaggtgttgcaatccccagc-3',\n",
       " 'matrix.41',\n",
       " 'mutationssupplementarythese',\n",
       " 'gccagcattttagcattacttc',\n",
       " 'a864t',\n",
       " 'indouble',\n",
       " 't49.2/t49.3',\n",
       " '9206s',\n",
       " 'promoter-ttadriven',\n",
       " 's1/kh',\n",
       " 'interaction17',\n",
       " 'unknown/na14.813.223.8',\n",
       " 'p110-d964a',\n",
       " 'jak3-stat3',\n",
       " 'extendmr',\n",
       " 'y-733',\n",
       " 'pbabe-zeo-nrasg12d',\n",
       " 'mek1c121s-expressing',\n",
       " 'anti-pdgfc',\n",
       " 'asp-220',\n",
       " 'g1567d',\n",
       " '211403',\n",
       " 'polyacryiamide',\n",
       " 'p.val600_lys601',\n",
       " 'jim3',\n",
       " 'statethe',\n",
       " 'staphylococcusbetween',\n",
       " \"5'-attacacagtatcctcgaca-3\",\n",
       " 'ha-11',\n",
       " 'u3-1287',\n",
       " 'di7s2sorepeating',\n",
       " 'hadshould',\n",
       " 'y1003x',\n",
       " 'sd/agar',\n",
       " 'differentiation,3',\n",
       " 'staalesen',\n",
       " '70800',\n",
       " 'activity.23,24',\n",
       " 'againstwap-cre',\n",
       " 'g162rnormalnormaldeficientdeleterious0.00pathogenic',\n",
       " 'gly719x',\n",
       " 'mds19',\n",
       " 'd464g',\n",
       " 'mode.14',\n",
       " 'y88c',\n",
       " 'asp770_asn771insmetalathrpro',\n",
       " 'oxiod',\n",
       " '50target',\n",
       " 'q1496h',\n",
       " 'c135r',\n",
       " 'crizotinibc',\n",
       " 'sequencing.10',\n",
       " 'rotterdam.26',\n",
       " 'defectao105yes',\n",
       " 't198a/g',\n",
       " '14.981.7',\n",
       " 'locatedleading',\n",
       " 'sex.1984',\n",
       " 'capacityrestoring',\n",
       " '20-mmol/l',\n",
       " 'butdownloadin',\n",
       " 'makita',\n",
       " 'syndromeassociated',\n",
       " 'newman-keuls',\n",
       " '5-gatggtgggggccctcctctt-3',\n",
       " 'mutagenesismutants',\n",
       " 'analysisimmunohistochemical',\n",
       " 'rs3f',\n",
       " 'that.altogether',\n",
       " '030/50',\n",
       " 'conformationgrowthcodon',\n",
       " 'gln184stop',\n",
       " 'arrest.the',\n",
       " '3.40+0.12',\n",
       " 'a328p6ntp91l',\n",
       " 'status22',\n",
       " 'sample.sanger',\n",
       " 'pocket4a',\n",
       " 's71w',\n",
       " 'sdcbp2',\n",
       " 'm114',\n",
       " 'phospho-ros1',\n",
       " 'nm_007817',\n",
       " 'ntgtg',\n",
       " 'lossoffunction',\n",
       " 'eachaffected',\n",
       " '71f/57stomach453spvery',\n",
       " 'anti-cd4',\n",
       " '144k',\n",
       " 'scalia',\n",
       " 'microcystein-lr',\n",
       " 'repairatpase',\n",
       " 'me6',\n",
       " 'erk.of',\n",
       " 'tfsearch',\n",
       " 'xeds/eels',\n",
       " 'her2+/pi3k-mutant',\n",
       " 's6o',\n",
       " 'p5-specic',\n",
       " 'l65f',\n",
       " 'antibody.of',\n",
       " 'a3062g',\n",
       " 'malignancy13',\n",
       " 'locusb',\n",
       " 'hkr1',\n",
       " 'supplemental5a-b',\n",
       " '2chighwire_math',\n",
       " 'sc1747',\n",
       " 'turnhout',\n",
       " 'nrap',\n",
       " 'wasthree',\n",
       " '413641513',\n",
       " 'metnm_000245c2646tp814sgermline1/0',\n",
       " 'binitial',\n",
       " 'tgcggacagg',\n",
       " 'carcinoma987512.512.5040',\n",
       " 'lsab+system-hrp',\n",
       " 'ret-ntrk1',\n",
       " 'nordenstadt',\n",
       " 'molecularn114s',\n",
       " 'rotolib2.aa',\n",
       " 'backgroundcould',\n",
       " 'n105kthe',\n",
       " 'y110c',\n",
       " '395one',\n",
       " 'complexes129',\n",
       " 'sulfateinduced',\n",
       " 'shrna-smad2',\n",
       " 'c-dx',\n",
       " 'trii/alk-x',\n",
       " 'sa-b',\n",
       " 'h920',\n",
       " 'referencingca',\n",
       " 'ctm-5990506',\n",
       " 'er+breast',\n",
       " '33-kda',\n",
       " 'alkf1178l',\n",
       " '5tgcaaggtggagcgattctg',\n",
       " 'withclosure',\n",
       " 'substitutionb',\n",
       " 't-lymphotropic',\n",
       " 'pa30824101none0',\n",
       " 'lm3d-induced',\n",
       " 'vomiting30',\n",
       " 't479pvitro',\n",
       " '5-acacgtccccatctgaag-3',\n",
       " 'a279vprotein',\n",
       " 'br67',\n",
       " 'decribed.8',\n",
       " 'complementarygene',\n",
       " 'hoxd8',\n",
       " '5-uauauuuauauauuagacgdgdg-3',\n",
       " 'skp1or',\n",
       " 'imatinib.the',\n",
       " ...}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Words in vocab_words that do not appear in biolab_words.\n",
    "vocab_not_in_biolab = set(vocab_words).difference(biolab_words)\n",
    "print(len(vocab_not_in_biolab))\n",
    "# Preview a small seeded sample; echoing the whole set floods the notebook output\n",
    "# and its order is arbitrary.\n",
    "random.Random(random_state_number).sample(sorted(vocab_not_in_biolab), min(20, len(vocab_not_in_biolab)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No word-to-id dictionary is needed, since this is indexed by the words themselves."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## using biolab words for missing corpus words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:26:23.487488Z",
     "start_time": "2017-09-24T18:26:23.479630Z"
    }
   },
   "outputs": [],
   "source": [
    "undesirable_ascii_characters = list(range(32))\n",
    "undesirable_ascii_characters.remove(10) #keep new line since this might be used for sentence tokenizer\n",
    "undesirable_charmap = dict.fromkeys(undesirable_ascii_characters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:28:58.621275Z",
     "start_time": "2017-09-24T18:28:47.364162Z"
    }
   },
   "outputs": [],
   "source": [
    "from nltk import word_tokenize\n",
    "from utils import custom_word_tokenizer, apply_custom_regx\n",
    "\n",
    "# Maps a lower-cased word (or sub-token) to a vector. Every word in vocab_biolab gets its\n",
    "# own biolab vector; a sub-token that is already a key is blended 50/50 with the vector of\n",
    "# each word it occurs in.\n",
    "custom_tokenized_biolab_pubmed_pmc_wiki_wv = {}\n",
    "# Sorted iteration makes the result reproducible: a word's own vector overwrites any blend\n",
    "# stored earlier under the same key, so the outcome depends on visiting order, and raw set\n",
    "# order is arbitrary.\n",
    "for word in sorted(vocab_biolab):\n",
    "    # NOTE(review): word_vec may return a (possibly read-only) row of the model's weight\n",
    "    # matrix - copy it so nothing stored here aliases the model's own storage.\n",
    "    vector = biolab_keyed_vectors_pubmed_pmc_wiki.word_vec(word).copy()\n",
    "    custom_tokenized_biolab_pubmed_pmc_wiki_wv[word.lower()] = vector\n",
    "    # Clean the word the same way before tokenizing it: drop non-ASCII characters,\n",
    "    # strip control characters, apply the project regexes, remove literal backslash-t.\n",
    "    cleaned = word.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore')\n",
    "    cleaned = str(cleaned).translate(undesirable_charmap)\n",
    "    cleaned = apply_custom_regx(cleaned)\n",
    "    cleaned = cleaned.replace('\\\\t', '')\n",
    "    for part in word_tokenize(cleaned):\n",
    "        if part in custom_tokenized_biolab_pubmed_pmc_wiki_wv:\n",
    "            # Out-of-place blend: builds a new array instead of mutating the stored one.\n",
    "            previous = custom_tokenized_biolab_pubmed_pmc_wiki_wv[part]\n",
    "            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] = (previous + vector) / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:29:03.176765Z",
     "start_time": "2017-09-24T18:29:03.168848Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100489"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(custom_tokenized_biolab_pubmed_pmc_wiki_wv)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### for tensorboard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:30:11.013346Z",
     "start_time": "2017-09-24T18:30:11.008342Z"
    }
   },
   "outputs": [],
   "source": [
    "tb_vocab_size=5000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:36:45.940301Z",
     "start_time": "2017-09-24T18:36:45.836967Z"
    }
   },
   "outputs": [],
   "source": [
    "# Take up to tb_vocab_size words (arbitrary set order) for the TensorBoard projector.\n",
    "tb_vocab_biolab = list(vocab_biolab)[:tb_vocab_size]\n",
    "# One label per line, written verbatim. csv.writer would quote labels that contain\n",
    "# a double quote and terminate the file with \\\\r\\\\n.\n",
    "with open(\"view_wvs_tb/tb_vocab.tsv\", \"w\", encoding=\"utf-8\") as fp:\n",
    "    fp.write('\\n'.join(tb_vocab_biolab) + '\\n')\n",
    "\n",
    "# Size the matrix by the words actually written, so no row is left without a label\n",
    "# when the vocabulary is smaller than tb_vocab_size. Every row is filled below.\n",
    "tb_word_vectors = np.empty((len(tb_vocab_biolab), 200))\n",
    "for i, word in enumerate(tb_vocab_biolab):\n",
    "    # The vector dict is keyed by lower-cased words.\n",
    "    tb_word_vectors[i] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word.lower()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:39:20.401339Z",
     "start_time": "2017-09-24T18:39:20.214482Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5000, 200)\n"
     ]
    }
   ],
   "source": [
    "%autoreload\n",
    "from utils import visualize_embeddings_in_tensorboard\n",
    "visualize_this_embedding = tb_word_vectors\n",
    "print(visualize_this_embedding.shape)\n",
    "metadata_path = \"/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb/tb_vocab.tsv\"\n",
    "visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, \"/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:34:48.243291Z",
     "start_time": "2017-09-24T18:34:48.238267Z"
    }
   },
   "outputs": [],
   "source": [
    "del tb_word_vectors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## building word vectors of 200d for model\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:29:28.878519Z",
     "start_time": "2017-09-24T18:29:26.310330Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(352220, 200)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus_word_vectors = np.random.randn(len(vocab_words), 200)\n",
    "corpus_word_vectors.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T02:35:49.778344Z",
     "start_time": "2017-08-24T02:35:49.772266Z"
    }
   },
   "source": [
    "fill in biolab vectors available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:29:29.029521Z",
     "start_time": "2017-09-24T18:29:28.880040Z"
    }
   },
   "outputs": [],
   "source": [
    "for word in vocab_biolab:\n",
    "    dataset_corpus_word_index = vocab_wordidx[word]\n",
    "    corpus_word_vectors[dataset_corpus_word_index] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "total words not updated with training from biolab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:29:30.681212Z",
     "start_time": "2017-09-24T18:29:30.552877Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "251731"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words_not_updated = set(vocab_words) - vocab_biolab\n",
    "len(words_not_updated)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:29:35.743603Z",
     "start_time": "2017-09-24T18:29:35.715108Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'',\n",
       " 'mscv-nup214-abl1-ires-gfp',\n",
       " 'limma.18',\n",
       " 'c3h/1oth/2',\n",
       " '0.41.0',\n",
       " 'stablee',\n",
       " 'gdc-0879mediated',\n",
       " 'a62t',\n",
       " 'tumorsnamely',\n",
       " 'her2-so/cep17-sg',\n",
       " 'buffersubcloned',\n",
       " 'measurementss.d',\n",
       " 'cbreast',\n",
       " 'rppametastasis-associated',\n",
       " 'phenylalanine18',\n",
       " 'detectableboard',\n",
       " 'c.17981799gt',\n",
       " 'ofdoes',\n",
       " 'olfm4',\n",
       " 'fkbp12rapamycin',\n",
       " 'phenotype.lambdoid',\n",
       " 'fragment17',\n",
       " 'slc34a2',\n",
       " '20gap',\n",
       " 'suppressorwell',\n",
       " 'saciisite',\n",
       " 'reportednrnac.1a',\n",
       " 'd177y',\n",
       " 'observeddownloadin',\n",
       " 'p454s',\n",
       " 'only.7.four',\n",
       " 'ganglioglioma.25',\n",
       " 'glissons',\n",
       " 'puastderkwt',\n",
       " 'lamp2-positive',\n",
       " 'a/gagarose',\n",
       " 'dahln',\n",
       " 'obstructiona',\n",
       " 'etv1suggest',\n",
       " 'e32g',\n",
       " 'catga',\n",
       " 'identity22',\n",
       " 'mm00455685_m1',\n",
       " 'cys72asp74',\n",
       " 'itdalleles',\n",
       " 'coimmunoprecipitatedwith',\n",
       " 'andpepstatin',\n",
       " 'micej',\n",
       " '4016m1r219',\n",
       " 'treatment-relatedareas',\n",
       " 'smap.23',\n",
       " 'r70w',\n",
       " 'pt3n1',\n",
       " 'genessupplementaryincluding',\n",
       " '0.064-2.262',\n",
       " '2a763v',\n",
       " 'observations29',\n",
       " 'p14arf/dapk/p53',\n",
       " 'a314vexhibited',\n",
       " '321a',\n",
       " 'severalfragment',\n",
       " 'cellreaction',\n",
       " 'referred.haplotype',\n",
       " 'kemler',\n",
       " 's114x.11',\n",
       " 'dpp-8400574',\n",
       " '5-gttgaacggtggccacaccggc-3',\n",
       " '1501g-a',\n",
       " 'lymphoid-signaling',\n",
       " '2002tf',\n",
       " 'c1275y',\n",
       " 'tyr646',\n",
       " 'ar-v1',\n",
       " 'p-eif2',\n",
       " 'p53fl',\n",
       " '30345',\n",
       " 't241m',\n",
       " 'l288sfish',\n",
       " 'peripherydownloadin',\n",
       " 't/ti',\n",
       " 'l273m',\n",
       " 'carriers22',\n",
       " 'materialfig',\n",
       " '02097810',\n",
       " 'andets',\n",
       " 'visvanathan',\n",
       " '29leiomyosarcoma+0.750.31.171.831.44',\n",
       " 'substrates/oncogenic',\n",
       " 'pre-b/t-all',\n",
       " 'p.a148t',\n",
       " 'i11t',\n",
       " 'all.rapamycin',\n",
       " 'tumour-extracted',\n",
       " 'autogizer',\n",
       " 'supplemental3h',\n",
       " '53/411',\n",
       " 'azevedo',\n",
       " 'h50r',\n",
       " 'mutation.among',\n",
       " 'insectin',\n",
       " 'apc-coupled',\n",
       " 'muci/gram',\n",
       " 'x87838',\n",
       " 'homotrimers.options',\n",
       " 'nsii',\n",
       " 'teetthvvmktdaefvcertlkyflgiaggkwvvsyfwvtqsikerkmlnehdfevrgdv',\n",
       " 'min.the',\n",
       " 'rs743185',\n",
       " 'fiftyfive',\n",
       " 'c.625c',\n",
       " 'g360a',\n",
       " 'needof',\n",
       " 'c2h2-type',\n",
       " 'r369w',\n",
       " 'asxl1msh6unfortunately',\n",
       " '211k24',\n",
       " '10f4252pancolonic1.8+apyes',\n",
       " 'msi+24poly',\n",
       " 'med12/cyclin',\n",
       " 'localtest',\n",
       " 'rate.largedownload',\n",
       " 'co-defining',\n",
       " 'johinaz.cgen',\n",
       " 'l115p',\n",
       " '65:5563',\n",
       " '4afigure',\n",
       " 'rv560-561',\n",
       " 'pit-2',\n",
       " 'pd03259019',\n",
       " 'palb2-associated',\n",
       " 'akt3z',\n",
       " 'proteinandgrowth',\n",
       " '9one',\n",
       " 'kolodnerterminal',\n",
       " 'r905',\n",
       " 'butby',\n",
       " 'denysdrash',\n",
       " 'proline-287',\n",
       " 'solexa-illumina',\n",
       " 'fgfr3.32,33',\n",
       " 'irish/german',\n",
       " 'meanslevels',\n",
       " 'lxxll-like',\n",
       " 'pe1e2s1',\n",
       " 'pone.0064364.e015.jpg1.0',\n",
       " \"5'-agttccactcttagaggtag-3\",\n",
       " 'fujigaoka',\n",
       " 'lex1',\n",
       " 'v261m',\n",
       " 'y17126',\n",
       " 'inectmwt-1',\n",
       " 'bcdk6',\n",
       " 'gilhuis',\n",
       " 'd1203',\n",
       " 'supplementarymutantsassessed',\n",
       " 'nb1224',\n",
       " 'element/binding',\n",
       " 'transduction.5,6',\n",
       " 'cr-oe33',\n",
       " 'h1417d',\n",
       " 'cysts/multinodular',\n",
       " '2c/min',\n",
       " '5-aatagattctggcattgtggtccccgttttcttatggg-3',\n",
       " 'h-actin',\n",
       " 'gimema',\n",
       " 'd18n',\n",
       " 'd1s2737',\n",
       " '15,000r112c',\n",
       " 'resper',\n",
       " 'h701',\n",
       " 's34f/y',\n",
       " 'c.6014c',\n",
       " 'e295k',\n",
       " 'h2aub1',\n",
       " 'response32.3',\n",
       " 'pik3cathat',\n",
       " 'tg101209.our',\n",
       " 'v379a',\n",
       " 'cd4-cell',\n",
       " 'g4450a',\n",
       " 'bamhi/aflii',\n",
       " 'fgf-mapk-pathway',\n",
       " 'o61267',\n",
       " 'kumamoto',\n",
       " '12.9other',\n",
       " 'introns43',\n",
       " 'l2575',\n",
       " 'y126',\n",
       " 'c.2221-126c',\n",
       " 'gtctctcccttgaaatgctgtga',\n",
       " 'mplw515l-expressing',\n",
       " 'ezh2h689a,19',\n",
       " 'animals13',\n",
       " 'elf-3',\n",
       " 'v218g',\n",
       " '24772/pnf',\n",
       " 'fhgenomic',\n",
       " 'tmprss2erg3',\n",
       " 'xyftqtllpglag',\n",
       " 'tac272tat',\n",
       " 'workresistance',\n",
       " 'nbd-gdp/gtp',\n",
       " '9121s',\n",
       " 'venkataramani',\n",
       " 'bp21',\n",
       " 'f451l',\n",
       " 'epithelial183',\n",
       " 'nut-positive',\n",
       " '212-717-3203',\n",
       " 'q0q1dt18',\n",
       " 'q661',\n",
       " 'gcagctgcccggggccgaca',\n",
       " '0.00010.13',\n",
       " 's1898f',\n",
       " 'dexseq.29',\n",
       " '19.6os',\n",
       " 'mda231',\n",
       " 'respectively.sequencing',\n",
       " 'nonfunctioningmaximizing',\n",
       " 'routineuse',\n",
       " 'notch1-responsive',\n",
       " 'ascom-mll3',\n",
       " 'therapeutics130',\n",
       " 'cellcompound-protein',\n",
       " 'univ-lvon1.fr',\n",
       " 'processesfor',\n",
       " 'centrosomes7a',\n",
       " 'amino-binding',\n",
       " 'cells125',\n",
       " 'amx500',\n",
       " 'sdsr703p',\n",
       " 'adenocpoor50m40japan',\n",
       " 'panasonic',\n",
       " 'i220',\n",
       " \"5'-gagtgctctaatgactgactgaga-3'/5'-aaaggtgacatggaaagccc-3\",\n",
       " 'foldin',\n",
       " '0/134',\n",
       " 'tissuessupplementaryof',\n",
       " 'c.1702_1703del',\n",
       " 'ea64',\n",
       " 'v37m',\n",
       " 'normalsupplementalmargaret',\n",
       " 'lymphocytosissupplementarynormal',\n",
       " 'eachrepresents',\n",
       " 'l1546n',\n",
       " 'f.d.n',\n",
       " 'a259raf-13b',\n",
       " 'c141y',\n",
       " 'eitherslcros',\n",
       " 'esr1-e380q',\n",
       " 'www-huber.embl.de/users/anders/htseq',\n",
       " 'tyr1278',\n",
       " 'viadetected',\n",
       " 'ubbe',\n",
       " 'coitus.56runx1',\n",
       " 'domain/ras',\n",
       " 'pdgf-a/c',\n",
       " 'phospho-erbb2',\n",
       " 'supplementarysmarca4',\n",
       " 'ethyleneglycoltetracetic',\n",
       " 'wascomparable',\n",
       " 'hsc/clp',\n",
       " 'tim-craf',\n",
       " '24hec',\n",
       " 'thegene',\n",
       " 'fdr=0.01',\n",
       " 'datafig',\n",
       " 'nowak-wegrzyn',\n",
       " '1321g',\n",
       " 'brc15',\n",
       " 'mef2btranscript',\n",
       " 'panitumumabf',\n",
       " 'aml.15,23,27',\n",
       " '0019059',\n",
       " 'for13',\n",
       " '1779insc',\n",
       " 'r257g',\n",
       " 'p.leu597val',\n",
       " 'l766pproteins',\n",
       " 'hgf-mediated',\n",
       " 'uhplc-qqtof-ms',\n",
       " 'msh6.67',\n",
       " 'frame.29',\n",
       " 'backx',\n",
       " 'glu118',\n",
       " 'pcaf-mediated',\n",
       " '596718',\n",
       " 'www.arup.utah.edu',\n",
       " 'investigations,36',\n",
       " 'ands21',\n",
       " 'inicd1.reverse',\n",
       " 'paired-primers',\n",
       " 'h1155',\n",
       " 'showedremarkable',\n",
       " '5-cagtttctgtctgctaggag-3',\n",
       " 'hal-b2',\n",
       " 'assaysleft',\n",
       " 's966q',\n",
       " 'al-elein',\n",
       " 'genesunfortunately',\n",
       " 'nickel-1,2-dioleolyl-sn-glycero-3',\n",
       " 'mandatory.26',\n",
       " 'wereaccording',\n",
       " 'gli1-driven',\n",
       " 'tyrosinethus',\n",
       " 'bc-his6lane',\n",
       " 'garcia-olive',\n",
       " 'p27t198v',\n",
       " 'thr75met75',\n",
       " 'intoprb16',\n",
       " 'th2-helper',\n",
       " 'iswi-related',\n",
       " 'sos1e846k',\n",
       " 'spoplalso',\n",
       " 'dicer16',\n",
       " 'lung29,30',\n",
       " 'serleu',\n",
       " '1-trcn0000074283',\n",
       " 'proteinsone',\n",
       " 'psq-containing',\n",
       " 'anti-erk-2',\n",
       " 'suppression.it',\n",
       " 'gst-erk2',\n",
       " 'nacl',\n",
       " 'doublet=en',\n",
       " 'samplewould',\n",
       " '503-220-3405',\n",
       " 'erbb4.23',\n",
       " 'tyr279cys',\n",
       " 'studied1',\n",
       " 'erk1k71r/erk2k54r',\n",
       " 'irizarry',\n",
       " 'fiscella',\n",
       " 'infiltratingresistance',\n",
       " 't49.8',\n",
       " 'acidsubstitutions',\n",
       " '600.options',\n",
       " 'hadeggs',\n",
       " 'whenof',\n",
       " '11resulting',\n",
       " 'cellsmander',\n",
       " 'methylation4,12',\n",
       " 'caggtcttgatgtacttccctcgtttgtgcagc',\n",
       " 'leu-536',\n",
       " 'ppm1d-mutant',\n",
       " 't232',\n",
       " 'v101m',\n",
       " '1592delt',\n",
       " 'atpprotein',\n",
       " 'examineddiagnosed',\n",
       " 'resistance3b',\n",
       " \"5'-gggaccggcttaatccatag-3\",\n",
       " 'biotecnologies',\n",
       " 'rxrb',\n",
       " 'assays.72',\n",
       " 'slidenorthern',\n",
       " 'sdfl',\n",
       " 'pixsys',\n",
       " 'gcn5/pcaf',\n",
       " 'e6.5-e9.5',\n",
       " 'fkrkhkkdisqnkravrr',\n",
       " 'mir-34c-5p',\n",
       " 'mores3',\n",
       " 'd024',\n",
       " 'approximation.20,21',\n",
       " 'nuclearextracts',\n",
       " 'cases.7,10',\n",
       " 'mmmt',\n",
       " 'poulikakos',\n",
       " 'tecknica',\n",
       " '140481275-140481298',\n",
       " 'anti-ar3',\n",
       " 'p70s6k-t389',\n",
       " 'ccagagtgctctaatgactg',\n",
       " 'c.ten',\n",
       " 'york-presbyterian/columbia',\n",
       " 'alone5',\n",
       " 'dnindividuals',\n",
       " '3376554583moderate',\n",
       " 'gfp-ezh2y641',\n",
       " 'kmeindl',\n",
       " '657185',\n",
       " 'anti-phospho-c-kit',\n",
       " 'knies-bamforth',\n",
       " 'aneuploidy.micrographs',\n",
       " 'cancers4547',\n",
       " 'sox11-negative',\n",
       " 'hsc7e116',\n",
       " 'inhibitor7figure',\n",
       " '2003-00328',\n",
       " 'gfoldwt',\n",
       " '10wtwtwtwtwtwtwtwtwtwt1',\n",
       " 'translational/targeted',\n",
       " '3q4u',\n",
       " 'betap2loop',\n",
       " 'geneperhaps',\n",
       " 'coot.embl.de',\n",
       " 'homologywith',\n",
       " 'cyclearrested',\n",
       " 'skin.58',\n",
       " 'delinsl',\n",
       " 'q1537rq1537r',\n",
       " 'peprotec',\n",
       " '302131',\n",
       " 'thusthere',\n",
       " 'micecarrying',\n",
       " 'speciwc',\n",
       " '430k10',\n",
       " 'f57l',\n",
       " 'b-rafq257rsupplementarys2',\n",
       " 'pdgfrp',\n",
       " 'prmd1',\n",
       " 'targets13',\n",
       " 'reflections14,83521,181',\n",
       " 'myc-reconstituted',\n",
       " 'polymorphism.in',\n",
       " 'ibgc4.7,13,14',\n",
       " 'bar=100m',\n",
       " 'sspi',\n",
       " 'clegg',\n",
       " 'inknockdown',\n",
       " 'inactivation,16',\n",
       " 'set1a/b',\n",
       " '4division',\n",
       " 'v1m',\n",
       " 'datayes',\n",
       " 'tctgcagcagcaggcaga',\n",
       " 'l276praf',\n",
       " 'mutras',\n",
       " 'lats2-expressing',\n",
       " 'studyf',\n",
       " 'dccd',\n",
       " 'flj126847.7col8a1acollagen',\n",
       " 'pvhl-defective',\n",
       " 'lys569',\n",
       " 'm243-f1695',\n",
       " 'that2',\n",
       " 'homology2',\n",
       " '78260810',\n",
       " 'sh_1',\n",
       " 'zc3h12b',\n",
       " 'songet',\n",
       " 'g1157s',\n",
       " 'medium10',\n",
       " 'k433r',\n",
       " '28+ndnd',\n",
       " '4087197',\n",
       " 'missense427.5nonsense11820.9frameshift',\n",
       " '1207/1605',\n",
       " 'cellsac',\n",
       " '5cctcctaccttggcattaca3',\n",
       " 'dosagef',\n",
       " 'debrauwere',\n",
       " 'betweendifferent',\n",
       " '20092013',\n",
       " '9p272s111310',\n",
       " 'wild-typeqi',\n",
       " 'pfdn6',\n",
       " '121anormalgacaspgacatcnegneg',\n",
       " 'receptorlung',\n",
       " 'ic50sdeletions',\n",
       " '986995',\n",
       " 'appbp1-uba3',\n",
       " 'r482c',\n",
       " 'g272d',\n",
       " 'g779s',\n",
       " 'ezrint567dwith',\n",
       " 'familyin',\n",
       " 'heterodimer.a',\n",
       " 'log-calculation',\n",
       " '18.2g',\n",
       " 'c27a',\n",
       " 'pastorfide',\n",
       " 'vaco5',\n",
       " 'shnf1e',\n",
       " 'checkpointlines',\n",
       " 'nhr14',\n",
       " '4515mbtnamissenseex8c.856c',\n",
       " 'sda-containing',\n",
       " 'melanoma38.4',\n",
       " 'hla-c*04:09n',\n",
       " 'reverse5-tcagtccataagccaagctctca-3',\n",
       " 'c.1501g4a',\n",
       " 'tnfsf11/rankl',\n",
       " 'a226tfigure',\n",
       " 'e640',\n",
       " '89991',\n",
       " '13871784',\n",
       " 'gelsi',\n",
       " 'rasbraf',\n",
       " 'a218v',\n",
       " 'g1202r.15alk',\n",
       " 'ac-ii',\n",
       " 'patientp22',\n",
       " 'sirnas2b',\n",
       " 'hafner',\n",
       " 'l858rand',\n",
       " 'c.s.hill',\n",
       " 'r196l',\n",
       " 'leukemias.40',\n",
       " 'leukemogeneic',\n",
       " 'separateof',\n",
       " 'mutations24',\n",
       " 'whalin',\n",
       " 'trkai/ii',\n",
       " 'e478k',\n",
       " 'rb-/-/p107',\n",
       " 'slidewe',\n",
       " 'samplescommon',\n",
       " 'phospho-kit',\n",
       " 'ckitwild-type',\n",
       " '3.1-pdgfr',\n",
       " 'cytoplasmic.the',\n",
       " 'asn117ser',\n",
       " 'nonpolyposispackage',\n",
       " 'shecases',\n",
       " 'lincscloud',\n",
       " 'daystop',\n",
       " 'p1087r',\n",
       " 'tcccctgttgattccctaga',\n",
       " '11f41',\n",
       " 'pittsburgh.sequence',\n",
       " 'gfr/pkc',\n",
       " 'c797s',\n",
       " 'torc1',\n",
       " 'ct60',\n",
       " 'genesdna',\n",
       " 't155i',\n",
       " 'informationapart',\n",
       " 'ttaggatgagcctctcctagactt',\n",
       " 'previously.4,17,22',\n",
       " 'indmem',\n",
       " '92.5101.5',\n",
       " 'she78-7',\n",
       " 'tia1',\n",
       " 'make.cells',\n",
       " '23995711',\n",
       " 'ds-55004',\n",
       " 'l430p',\n",
       " '426-521',\n",
       " 'method74',\n",
       " 'angio100',\n",
       " 'non-bat-rii',\n",
       " '368-5698',\n",
       " 'armigate',\n",
       " 'hpvprominently',\n",
       " '3.50+0.20',\n",
       " 'atcontent',\n",
       " 'fc2s',\n",
       " 'agentscolony',\n",
       " 'supp.s1',\n",
       " 'genomicer',\n",
       " 'cmv-vp16-tfap2a',\n",
       " '241-269',\n",
       " 'lys105',\n",
       " 'caagtattggtctctcgtctttcagctggataaggtctggtttaatgc',\n",
       " '0.27.6',\n",
       " 'physicallyeither',\n",
       " 'flag-traf6',\n",
       " 'nct00312377',\n",
       " 'ahcyl1',\n",
       " 'fgfr1,17',\n",
       " 'www.broadinstitute.org/cancer/software/genepattern',\n",
       " 'populations180',\n",
       " 'srp033306',\n",
       " 'n1380',\n",
       " 'wererequired',\n",
       " 'bindingcrystal',\n",
       " 'p087',\n",
       " '256kb',\n",
       " 'accagcca-ccactttctgatagg',\n",
       " 'catcccatggtggc*gggatggttgcagaag',\n",
       " 'issubset',\n",
       " 'certam',\n",
       " 'd446v',\n",
       " 'syndromecausative',\n",
       " 'y339',\n",
       " 'pone.0064364.e006.jpg',\n",
       " 'functioning.the',\n",
       " 'ligandlane',\n",
       " 'tyr791phe',\n",
       " 'c896',\n",
       " 'fgf8=1.79769e+308',\n",
       " 'ikk3',\n",
       " 'f99s',\n",
       " 'tasimilar',\n",
       " '2e758ga1l798f/ha4a864q',\n",
       " 'nucleotide.asxl1',\n",
       " 'y537s3045950.0110.0001',\n",
       " 'nes=2.05',\n",
       " '17q22-2514daint-213q32-33',\n",
       " 'fluor-555',\n",
       " 'flankswere',\n",
       " 'tgs-6',\n",
       " 'p73mutant',\n",
       " 'yasuji',\n",
       " 'p.asn127del',\n",
       " '2122-nt',\n",
       " 'cimp16',\n",
       " 'tamra-ins',\n",
       " 'fibroblasts5',\n",
       " 'serum5figure',\n",
       " 'studiesrnai',\n",
       " 'chromosome17q11.2',\n",
       " 'proliferationa',\n",
       " 'pci-neo-baf250',\n",
       " 't-e-y',\n",
       " 'd609g',\n",
       " 'lagerstedt',\n",
       " 'aptag-1',\n",
       " 'gcmn',\n",
       " 'tttggaagctctcagggtac',\n",
       " 'syndromes15,21',\n",
       " 'sophie',\n",
       " 'ciovacco',\n",
       " 'immunoblottingand',\n",
       " 'injeclion',\n",
       " 'c.2149g',\n",
       " 'pa2g4p4',\n",
       " 'dach1',\n",
       " 'sud-luxembourg',\n",
       " 'd85n',\n",
       " 'e600w',\n",
       " 'krasthe',\n",
       " '1mq4c',\n",
       " 'h193qp53',\n",
       " 'peg3',\n",
       " 'a40v',\n",
       " 'theerror-containing',\n",
       " 'g112e',\n",
       " 'messiaen',\n",
       " '586del',\n",
       " 'repeats5,6',\n",
       " 'mekwith',\n",
       " 'r748g',\n",
       " 'mpl.34-37',\n",
       " \"c'l\",\n",
       " 'nopho',\n",
       " 'bosmuller',\n",
       " 'devol',\n",
       " 'rs66944506',\n",
       " '4494503',\n",
       " 'arer207w',\n",
       " 'cellsdataset',\n",
       " 'fhl1-induced',\n",
       " 'wolf-hirschhorn',\n",
       " 'diderot',\n",
       " 'msh2-vd862msh6p',\n",
       " 'p3xflag-cmv-wild-type-chk2',\n",
       " '3.24+0.18',\n",
       " 'small.1',\n",
       " 'c.856g',\n",
       " 'masp2',\n",
       " 'shinmura',\n",
       " 'improm-ii',\n",
       " 'hs00368175_m1',\n",
       " '2supplementary1',\n",
       " 'ewsr1-2',\n",
       " 'd86n',\n",
       " 'errfi14a',\n",
       " 'egflane',\n",
       " 'f224lcontain',\n",
       " 'y859',\n",
       " 'smai/bglii',\n",
       " \"5'-atcatgtttgagaccttcaa-3\",\n",
       " 'e1a-binding',\n",
       " 'this25',\n",
       " '106the',\n",
       " 't/p.p214l',\n",
       " '0.20.3',\n",
       " 'supplementary11a',\n",
       " 'randerath',\n",
       " 'antigal4',\n",
       " 'chek1',\n",
       " 'differenceq276p',\n",
       " 'schwaller',\n",
       " 'g20a',\n",
       " 'htlv-iinfected',\n",
       " 'together.13,14,36',\n",
       " 'kms-9',\n",
       " 'pegfp-flag-pdz',\n",
       " 'antibody15,34,39',\n",
       " 't0.31',\n",
       " 'amc.uva.nl',\n",
       " 'c304',\n",
       " 'proteinsupplementaryno',\n",
       " '5001,600',\n",
       " 'arid1b-associated',\n",
       " 'receptor-smads',\n",
       " '5-ctggaagcaaagacggacaa-3',\n",
       " 'pznctj2-q205l',\n",
       " 'h1881',\n",
       " '7235g',\n",
       " 'siles',\n",
       " 'balkwill',\n",
       " 'mycmaxmxd',\n",
       " 'p2lv-h-rasand',\n",
       " 'theevent',\n",
       " 'il-1-il-1r',\n",
       " 'etoh/hanks',\n",
       " 'f7425',\n",
       " 'inhibitionsupplement',\n",
       " '5nmis',\n",
       " '10.1007/s10147-013-0602-1',\n",
       " 'et163950',\n",
       " 'future-the',\n",
       " 'erlotinib.81',\n",
       " 'non-transactivating',\n",
       " 'flt3mf5',\n",
       " 'comparedunfortunately',\n",
       " 'nucleotidesthe',\n",
       " 'p.q684x',\n",
       " '31544',\n",
       " 'resultsliposarcomas',\n",
       " 'adenocarcinomas.28,44,47',\n",
       " 'mutant-egfr',\n",
       " 'pgex-ecorv-sac1',\n",
       " 'specificallyt790m',\n",
       " 'lmpl',\n",
       " 'cisa',\n",
       " 'yeast,3',\n",
       " 'k708',\n",
       " 'competitorin',\n",
       " '0.500.83',\n",
       " 'donnem',\n",
       " 'l858r-substituted',\n",
       " 'paip1s-luc',\n",
       " 'c.425',\n",
       " 'bclanes',\n",
       " 'immunoblottingfor',\n",
       " 'm-v5',\n",
       " 'f37',\n",
       " 't4m0',\n",
       " 'smad4-reconstituted',\n",
       " 'ct-a/cyt-1',\n",
       " 'pladienolide,64',\n",
       " 'fgfr2-cit',\n",
       " 'stag2l360w',\n",
       " 'polymicrogyria-postaxial',\n",
       " 'methodsr300h',\n",
       " 'glfg-2',\n",
       " 'flag-elf3n233',\n",
       " 'anti-gata3',\n",
       " 'pml/rar-expressing',\n",
       " 'n=48supplementary',\n",
       " 'catagg',\n",
       " 'ssc/0.3',\n",
       " 'variation_29880',\n",
       " 'ew-sm1-989',\n",
       " 'restrictionpattern',\n",
       " 'function5677insatruncated',\n",
       " 'dmempten',\n",
       " 'condensedf',\n",
       " '2683-2711',\n",
       " 'pe-fluorescence',\n",
       " 'pathogenica',\n",
       " '3.285614467',\n",
       " 'w406r',\n",
       " 'ampmdm2',\n",
       " 'srf-staining',\n",
       " 'zfn639',\n",
       " 'y6a',\n",
       " 'c-16',\n",
       " 'c3h/10t/2',\n",
       " 'y-meso-27',\n",
       " 'fc=1.46',\n",
       " 'g482v',\n",
       " 'transformationrelevant',\n",
       " 'p332q',\n",
       " 'lys3326tersdhb',\n",
       " '4337123',\n",
       " 'p53-/-/p210',\n",
       " 'bjornsti',\n",
       " 'allele34,35',\n",
       " 'theiii',\n",
       " 'igf-i-induced',\n",
       " '2.1289126',\n",
       " 'e40t',\n",
       " 'bc012846.1',\n",
       " 'cellssupplementalimportantly',\n",
       " 'vigers',\n",
       " 'anassays',\n",
       " 'brim-311',\n",
       " 'rp11-295i5',\n",
       " 'ttce2',\n",
       " 'idh2,15',\n",
       " 'detectedlanes',\n",
       " 'bcr-ablt315itransformed',\n",
       " 'il-9r/c',\n",
       " 'esr-y537n',\n",
       " 'melanomasl',\n",
       " 'g418-positive',\n",
       " 'primersthat',\n",
       " 'wee1hu',\n",
       " 'hrp-coupled',\n",
       " 'wild-type-likemutant',\n",
       " '4/490',\n",
       " 'leu298',\n",
       " 'collagen-5a1',\n",
       " 'p.q9x',\n",
       " 'hetzer',\n",
       " 'gradeprognostic',\n",
       " 'wm278-gfp',\n",
       " 'spen-specific',\n",
       " 'contexts.conclusionin',\n",
       " 'e02641959224774121743269288',\n",
       " 'tmm18',\n",
       " 'g662eschematic',\n",
       " 'h585d',\n",
       " 'gagtcatcaattttattctgactgatcc',\n",
       " 'myc5a',\n",
       " '37yrs',\n",
       " 'pro261-induced',\n",
       " 'ng/lin',\n",
       " 'post-enucleated',\n",
       " 's1py-bound',\n",
       " 'v281i',\n",
       " 'disease0',\n",
       " 'bartletts',\n",
       " 'cellssupplementaryoverall',\n",
       " 'modifications.9,10',\n",
       " '5-caaga-acagcaacgagtaccg-3',\n",
       " 'ostium-primum',\n",
       " 'female3823',\n",
       " 'pms2-interactive',\n",
       " 'aacggtaccaaggctgagaa',\n",
       " 'texasred-conjugated',\n",
       " 'lskeffect',\n",
       " 'mfinal',\n",
       " 'spop-mediated',\n",
       " 'h-galactosidase',\n",
       " '70.169.2',\n",
       " 'esmedical',\n",
       " 'budssuch',\n",
       " 'reorganizedin',\n",
       " 'r246w',\n",
       " '32p-4e-bp1',\n",
       " \"5'-aatgcccat\",\n",
       " 'anti-phospho-c-met',\n",
       " '68/f',\n",
       " 'trastuzumab.25',\n",
       " 'snu-886',\n",
       " 'cplcknditkrslqestrfsqlveellkii',\n",
       " 'htertnot',\n",
       " '1.282.17',\n",
       " 'satoko',\n",
       " '5-aaggtgttgcaatccccagc-3',\n",
       " 'matrix.41',\n",
       " 'mutationssupplementarythese',\n",
       " 'gccagcattttagcattacttc',\n",
       " 'a864t',\n",
       " 'indouble',\n",
       " 't49.2/t49.3',\n",
       " '9206s',\n",
       " 'promoter-ttadriven',\n",
       " 's1/kh',\n",
       " 'interaction17',\n",
       " 'unknown/na14.813.223.8',\n",
       " 'p110-d964a',\n",
       " 'jak3-stat3',\n",
       " 'extendmr',\n",
       " 'y-733',\n",
       " 'pbabe-zeo-nrasg12d',\n",
       " 'mek1c121s-expressing',\n",
       " 'anti-pdgfc',\n",
       " 'asp-220',\n",
       " 'g1567d',\n",
       " '211403',\n",
       " 'polyacryiamide',\n",
       " 'p.val600_lys601',\n",
       " 'jim3',\n",
       " 'statethe',\n",
       " 'staphylococcusbetween',\n",
       " \"5'-attacacagtatcctcgaca-3\",\n",
       " 'ha-11',\n",
       " 'u3-1287',\n",
       " 'di7s2sorepeating',\n",
       " 'hadshould',\n",
       " 'y1003x',\n",
       " 'sd/agar',\n",
       " 'differentiation,3',\n",
       " 'staalesen',\n",
       " '70800',\n",
       " 'activity.23,24',\n",
       " 'againstwap-cre',\n",
       " 'g162rnormalnormaldeficientdeleterious0.00pathogenic',\n",
       " 'gly719x',\n",
       " 'mds19',\n",
       " 'd464g',\n",
       " 'mode.14',\n",
       " 'y88c',\n",
       " 'asp770_asn771insmetalathrpro',\n",
       " 'oxiod',\n",
       " '50target',\n",
       " 'q1496h',\n",
       " 'c135r',\n",
       " 'crizotinibc',\n",
       " 'sequencing.10',\n",
       " 'rotterdam.26',\n",
       " 'defectao105yes',\n",
       " 't198a/g',\n",
       " '14.981.7',\n",
       " 'locatedleading',\n",
       " 'sex.1984',\n",
       " 'capacityrestoring',\n",
       " '20-mmol/l',\n",
       " 'butdownloadin',\n",
       " 'makita',\n",
       " 'syndromeassociated',\n",
       " 'newman-keuls',\n",
       " '5-gatggtgggggccctcctctt-3',\n",
       " 'mutagenesismutants',\n",
       " 'analysisimmunohistochemical',\n",
       " 'rs3f',\n",
       " 'that.altogether',\n",
       " '030/50',\n",
       " 'conformationgrowthcodon',\n",
       " 'gln184stop',\n",
       " 'arrest.the',\n",
       " '3.40+0.12',\n",
       " 'a328p6ntp91l',\n",
       " 'status22',\n",
       " 'sample.sanger',\n",
       " 'pocket4a',\n",
       " 's71w',\n",
       " 'sdcbp2',\n",
       " 'm114',\n",
       " 'phospho-ros1',\n",
       " 'nm_007817',\n",
       " 'ntgtg',\n",
       " 'lossoffunction',\n",
       " 'eachaffected',\n",
       " '71f/57stomach453spvery',\n",
       " 'anti-cd4',\n",
       " '144k',\n",
       " 'scalia',\n",
       " 'microcystein-lr',\n",
       " 'repairatpase',\n",
       " 'me6',\n",
       " 'erk.of',\n",
       " 'tfsearch',\n",
       " 'xeds/eels',\n",
       " 'her2+/pi3k-mutant',\n",
       " 's6o',\n",
       " 'p5-specic',\n",
       " 'l65f',\n",
       " 'antibody.of',\n",
       " 'a3062g',\n",
       " 'malignancy13',\n",
       " 'locusb',\n",
       " 'hkr1',\n",
       " 'supplemental5a-b',\n",
       " '2chighwire_math',\n",
       " 'sc1747',\n",
       " 'turnhout',\n",
       " 'nrap',\n",
       " 'wasthree',\n",
       " '413641513',\n",
       " 'metnm_000245c2646tp814sgermline1/0',\n",
       " 'binitial',\n",
       " 'tgcggacagg',\n",
       " 'carcinoma987512.512.5040',\n",
       " 'lsab+system-hrp',\n",
       " 'ret-ntrk1',\n",
       " 'nordenstadt',\n",
       " 'molecularn114s',\n",
       " 'rotolib2.aa',\n",
       " 'backgroundcould',\n",
       " 'n105kthe',\n",
       " 'y110c',\n",
       " '395one',\n",
       " 'complexes129',\n",
       " 'sulfateinduced',\n",
       " 'shrna-smad2',\n",
       " 'c-dx',\n",
       " 'trii/alk-x',\n",
       " 'sa-b',\n",
       " 'h920',\n",
       " 'referencingca',\n",
       " 'ctm-5990506',\n",
       " 'er+breast',\n",
       " '33-kda',\n",
       " 'alkf1178l',\n",
       " '5tgcaaggtggagcgattctg',\n",
       " 'withclosure',\n",
       " 'substitutionb',\n",
       " 't-lymphotropic',\n",
       " 'pa30824101none0',\n",
       " 'lm3d-induced',\n",
       " 'vomiting30',\n",
       " 't479pvitro',\n",
       " '5-acacgtccccatctgaag-3',\n",
       " 'a279vprotein',\n",
       " 'br67',\n",
       " 'decribed.8',\n",
       " 'complementarygene',\n",
       " 'hoxd8',\n",
       " '5-uauauuuauauauuagacgdgdg-3',\n",
       " 'skp1or',\n",
       " 'imatinib.the',\n",
       " ...}"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words_not_updated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T18:30:03.245404Z",
     "start_time": "2017-09-24T18:30:02.935839Z"
    }
   },
   "outputs": [],
   "source": [
    "np.save(\"processed/stage1/biolab_updated_wvs.npy\", corpus_word_vectors)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "## gcloud tensorboard serving"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T03:59:12.315180Z",
     "start_time": "2017-08-24T03:59:12.159412Z"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "dataset_corpus_words_list = np.load(\"dataset_corpus_words_list.npy\")\n",
    "corpus_word_vectors = np.load(\"corpus_word_vectors.npy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T03:59:12.771702Z",
     "start_time": "2017-08-24T03:59:12.767507Z"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "tb_vocab_size = 10000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "local_tb_dir = \"/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/gcloud/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T04:36:28.978027Z",
     "start_time": "2017-08-24T04:36:28.959764Z"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "with open(local_tb_dir+\"/vocab.tsv\", \"wb\") as fp:\n",
    "    wr = csv.writer(fp, delimiter='\\n')\n",
    "    wr.writerow(dataset_corpus_words_list[:tb_vocab_size])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "for http://projector.tensorflow.org/ vectors need to be in tsv form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T03:52:44.132601Z",
     "start_time": "2017-08-24T03:52:44.127629Z"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# np.savetxt(\"model_wv_visualize/word_vectors.tsv\",corpus_word_vectors[:tb_vocab_size], delimiter='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "write to checkpoint file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T04:18:31.688939Z",
     "start_time": "2017-08-24T04:18:31.410071Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rm: cannot remove '/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/checkpoint': No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!rm $local_tb_dir/checkpoint\n",
    "!ls $local_tb_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T04:18:51.849068Z",
     "start_time": "2017-08-24T04:18:51.568000Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 200)\n"
     ]
    }
   ],
   "source": [
    "from word2vec import visualize_embeddings_in_tensorboard\n",
    "visualize_this_embedding = corpus_word_vectors[:tb_vocab_size]\n",
    "print visualize_this_embedding.shape\n",
    "# path for gcloud tensorboard\n",
    "metadata_path = \"/home/bicepjai/projects/tb_visual/vocab.tsv\"\n",
    "# metadata_path = \"/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/vocab.tsv\"\n",
    "visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, local_tb_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-08-24T04:18:53.075361Z",
     "start_time": "2017-08-24T04:18:53.067692Z"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "checkpoint_txt = \"model_checkpoint_path: \\\"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\\\"\\n\\\n",
    "all_model_checkpoint_paths: \\\"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\\\"\"\n",
    "with open(local_tb_dir+\"/checkpoint\",\"w\") as f:\n",
    "    f.seek(0)\n",
    "    f.truncate()\n",
    "    f.write(checkpoint_txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FastText Vectors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### fasttext commands used"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-20T23:38:07.217912Z",
     "start_time": "2017-09-20T23:38:07.211306Z"
    }
   },
   "source": [
    "fasttext skipgram -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e\n",
    "\n",
    "fasttext cbow -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### reading ft vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:53:36.325814Z",
     "start_time": "2017-09-26T05:53:36.322630Z"
    }
   },
   "outputs": [],
   "source": [
    "fasttext_vec_file = \"processed/stage2/pretrained_word_vectors/ft_sg_200d_10e.vec\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:53:37.289186Z",
     "start_time": "2017-09-26T05:53:36.795990Z"
    }
   },
   "outputs": [],
   "source": [
    "ft_lines = None\n",
    "with open(fasttext_vec_file,\"r\") as f:\n",
    "    ft_lines = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:53:37.860361Z",
     "start_time": "2017-09-26T05:53:37.854348Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "362933 200\n",
      "\n",
      "<class 'list'> 362934\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(362933, 200)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(ft_lines[0])\n",
    "print(type(ft_lines), len(ft_lines))\n",
    "ft_shape = tuple([int(i.strip()) for i in ft_lines[0].split()])\n",
    "ft_shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:53:39.969236Z",
     "start_time": "2017-09-26T05:53:39.960969Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "201\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'the 0.027251 -0.018114 0.0096083 0.076723 -0.29626 0.05729 0.17298 0.097187 0.10251 0.16822 -0.40156 0.12471 0.11843 0.069956 0.031858 -0.20362 0.18791 -0.20113 -0.20219 0.002323 -0.30366 0.16106 -0.091842 0.028771 -0.082447 0.18842 0.02471 -0.10553 -0.28138 0.044856 -0.041988 -0.031351 0.25131 -0.18547 0.23941 -0.18438 0.12292 -0.039016 0.075311 0.028379 0.024822 -0.069827 0.054794 0.19297 0.19053 -0.15749 0.21978 -0.003489 -0.15063 -0.018887 0.05638 0.1385 0.10112 0.023256 -0.22436 -0.27619 -0.047866 -0.053595 0.010177 0.059109 0.078079 0.080721 -0.017329 0.29334 0.19386 0.1279 0.04759 0.11951 -0.37341 -0.028312 0.0086509 0.021498 0.049069 0.094658 -0.076768 0.00541 -0.0013258 -0.062564 -0.092488 0.15718 0.21148 0.11005 0.088614 0.17268 0.057106 -0.0044174 -0.0072504 0.01389 -0.067416 -0.18715 -0.009639 0.12991 0.11389 -0.0017624 0.020464 -0.19809 -0.038933 -0.016631 -0.24906 0.012139 0.21376 0.14972 -0.16496 0.3738 -0.095022 0.10864 -0.058577 -0.034298 0.0021112 -0.010114 -0.024814 0.027078 0.036302 0.10004 -0.35396 -0.064597 0.0010858 -0.0049044 -0.094081 0.096904 -0.0046191 0.074286 0.09301 -0.28307 -0.15225 0.064754 0.094255 0.20833 -0.088393 0.1362 0.11452 -0.076745 0.26119 0.068646 0.067695 -0.069496 -0.047141 0.11597 -0.18205 -0.074642 -0.0431 -0.15549 0.27262 -0.012248 0.067552 0.12357 -0.027967 -0.24034 0.21146 -0.030294 -0.16886 -0.36566 -0.027902 -0.04372 0.079934 -0.10144 -0.029423 -0.06038 -0.22478 -0.19269 -0.068223 -0.016667 0.3038 0.012443 -0.42416 0.077392 -0.19895 0.016593 0.051294 -0.0079492 0.11613 -0.13423 0.19772 0.056557 -0.023173 -0.15394 0.078205 -0.17027 -0.26604 0.098637 -0.036921 0.18138 0.20576 -0.17695 0.15974 -0.059677 0.017603 -0.21435 -0.036402 -0.085582 0.06715 0.080574 -0.21038 0.024121 -0.18857 -0.0853 -0.27693 -0.00081868 0.02937 0.040705 \\n'"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(len(ft_lines[1].split()))\n",
    "ft_lines[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:53:40.960601Z",
     "start_time": "2017-09-26T05:53:40.956120Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "362933"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ft_vocab_size=ft_shape[0]\n",
    "ft_vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:53:44.808768Z",
     "start_time": "2017-09-26T05:53:42.104177Z"
    }
   },
   "outputs": [],
   "source": [
    "ft_word_vectors = np.random.randn(ft_vocab_size, ft_shape[1])\n",
    "ft_words = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:54:08.362297Z",
     "start_time": "2017-09-26T05:53:44.810105Z"
    }
   },
   "outputs": [],
   "source": [
    "for i, line in enumerate(ft_lines[1:]):\n",
    "    str_list =line.split()\n",
    "    ft_words.append(str_list[0].strip())\n",
    "    vec = np.array([np.float(f) for f in str_list[1:]])\n",
    "    ft_word_vectors[i] = vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:54:08.367479Z",
     "start_time": "2017-09-26T05:54:08.363663Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(362933, 200)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ft_word_vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:54:08.477302Z",
     "start_time": "2017-09-26T05:54:08.369141Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['k2950n1.6.2501.857.151010r2108h2.722.55.521.745.481010s1733f8.88.7608.121.33108g1529r4.123.9708.091.23108i2285v5.492.4107.97.86107l1019v3.81.543.186.462.86106a75p3.52.312.516.342.20106t3349a.711.233.45.352.22105r1190w1.991.172.125.271.88105p1819s4.47.771.535.231.70105t630i4.64.5805.231.68105g1771d4.97.1505.121.33105k1690n4.49.5305.021.05105s1172l3.36.56.894.816.48104q2384k1.73.35.264.796.11104c554w1.31.33.74.715.15104d2312v.062.731.914.573.76104ivs26-20ct1.33.71.534.483.04104g602r.89.173.594.32.02104e462g1.39.641.823.846,960n56t.97.112.963.826,666h2074n3.5.3103.816,513r2973c1.39.262.633.755,685c3198r1.651.22.873.745,541i1929v.371.931.33.593,914n1228d.351.182.043.573,726h1918y2.31.2503.553,552v1306i1.891.21.373.472,979y3098h3.45.71.73.462,892v2969m2.06.82.573.452,806v894i.91.011.483.392,440i1349t.642.7203.372,320q1396r.021.741.63.362,280n2113s1.03.991.233.241,749r2842h.851.231.123.21,574ivs25+9ac.481.011.73.191,553v3079i.82.3803.191,545ivs11-20ta.83.232.523.111,288r2888c1.111.35.643.11,246t2250a1.3.291.9831,005f1524v1.991.18.172.99982l1904v.331.241.422.99976t582p.322.6402.96910d3170g1.54.071.422.89772d1280v2.571.01.722.85714p168t1.721.0902.81646n2048i.751.28.782.81639k2729n2.21.81.382.79614d2665g.61.891.12.61406r3052q1.241.3102.55354g1194d1.41.151.242.5316p375s1.261.2402.5315v2908g1.171.2402.41256d806h.571.18.662.41255t1354m1.12.271.562.4254n1102y2.53.1402.39246y3092c2.34.11.012.22166ivs20-16cg1.71.27.782.22165c1365y.571.23.382.18150n2436i.391.32.412.12132k2411t.841.2402.08119l2396f.671.3802.04111k513r.741.2902.03108table', 
'r866c7.71.762.6312.11.251012p142h8.922.16011.081.201011t1720a6.23.495.221.959.001010n810y2.427.281.241.948.771010s186y3.573.542.39.422.61109p1614s.517.121.028.644.40108v1534m4.633.9908.624.18108e597k6.551.9208.472.95108n723d1.138.9407.816.44107e1214k2.412.132.937.472.96107s1101n7.06.5506.513.23106ivs16-20ag2.573.6906.251.78106ivs8-17gt4.421.6206.041.10106p1238l5.251.31.546.021.05106y105c.834.57.125.291.94105v191i1.862.231.025.111.30105p334l.954.85.974.887.52104i1275v5.28.4904.796.16104ivs18-6ca2.142.4504.593.90104p1859r2.742.26.524.472.98104r504h3.19.571.664.281.92104s1266t1.872.3904.261.80104d67y3.23.281.084.031.06104i1044v1.542.403.948,723v1247i1.58.172.53.918,123ivs2-11delt1.492.3903.887,598ivs18-13ag.392.421.023.836,799i925l1.412.3903.86,251n473s1.292.4903.786,049ivs6+7ga1.432.16.173.755,601r1203q1.71.152.183.755,589g890v3.86.1603.75,006e842g1.222.4603.684,741m1652t1.74.091.923.573,728d369n3.77.2203.563,593d369del', '1.312.2103.523,328v1736a1.52.1.253.352,219k862e2.93.19.573.312,059r1028h1.232.0803.312,045e143k2.3.121.093.271,871d642h.75.072.493.161,455ivs2-13cg1.161.9103.071,166d1546y.562.4703.041,088a622v3.43.4502.98963v1804d2.61.57.882.92828ivs12+10gc.472.4302.9803i1405v.422.4602.88752q804h1.25.331.952.86729k1109n.512.3202.82664ivs11-11tc.372.4502.81648q155e.272.4502.72524p1637l1.83.4.462.69493t1349m2.71.0402.66462i1858l.142.502.65444n1468h.042.4902.53337m297i02.502.51322e1682k02.502.5317ivs21-8ct1.89.13.742.5313a280g.062.4902.43266ivs17-9at.092.4302.34218e1419q2.37.0602.31203f1662s.162.4502.28192h1402y.072.45.292.22165r1751q2.66.19.32.17150i124v2.21.0402.17147ivs15-7ct.24.121.972.1125n132k2.32.2302.09123d420y2.16.0702.09123l668f2.85.73.12.02105m1361l2.09.0702.02104brca2', 
'typeazd6244pd98059pd0325901u0126gsk1120212bsp600125bi78d3as601245sb203580ecc1endometrial3.1/na5.5/na7.1/na0.2/na1.4/nana/nana/nana/nana/nahec108endometrial9.8/nana/na5.5/na0.6/na0.6/nana/na4.1/na2.3/nana/nahec265endometrialna/nana/nana/na1.4/nana/nana/nana/na1.1/nana/nasngiiendometrial0.1/na0.4/na2.1/na0.6/7.81.3/nana/na1.1/na0.49/9.8na/naovk18ovarian0.03/1.20.03/2.70.04/1.80.009/0.70.04/9.50.09/7.10.06/8.20.08/6.4na/naic25', '1.13.071.232.43268ivs6-3cg2.28.07.002.35225ivs15+1ga2.11.19.002.30199ivs17+1ga2.21.09.002.29196t1685i2.10.04.002.15140i1766s1.11.16.872.14139g1738r1.94.12.002.06114t1685a2.00.03.002.03107ivs6-1gc1.84.01.001.8572m1689r1.49.17.001.6746s1715r1.56.09.001.6545ivs12+2del21insa1.46.10.001.5636m18t1.41.09.001.4931a1623g', 'cellmelanomal.vmphomamalthiv-positivelunggliomasgliohlaslomasaslrocytomasgliohlaslomasastrocytomasgliohlaslomasaslrocytomasprostalelocidinucleotidedis126d2s393d3s1067d5s644tp53dinucleolided2si23d2s136o3si067diis922tp53ap', 'ins20.9pd3052mansmokeradenocarcinomaivl858r/s768i11.3pd8945womanneveradenocarcinomaivl858r18.0pr11462womanneveradenocarcinomaivl858r14.7sd11639womanneveradenocarcinomaiiibl858r14.1sd14260womanneveradenocarcinomaiv19', 'wd37-1pdmet47-1w54h-1wdmet58-1wd59-1anapmet60-1wdmet61-1wd64-1anap65-1wd36-2wd37-2anap47-2wd54h-2wd58-2pdmet60-2wd61-2pd64-2anap65-2anap36-3wd37-3anapmet47-3wdmet54h-3wdmet58-3wdmet61-3pd64-3anap65-3anapmetin', 'a.a.g.c.g.c.a.a.g.a.t.c.c.a.g.g.t.t.c.tg.c.a.g.c.a.g.c.a.g.g.c.a.g.a.t.g.a.t.g.c.a.g.a.g.g.a.g.c.g.a.g.c.t.g.a.g.c.g.c.ct.c.c.a.g.c.a.a.ga.a.g.t.t.g.a.g.g.g.a.g.a.a.a.g.g.c.g.g.g.c.c.c.g.g.a.a.ac.a.g', '5.28.81.006.101,249,808r1699w4.34.26.004.6039,978g1788v2.37.421.063.857,054ivs5+3ag3.04.12.003.151,417g1706e2.35.13.292.77589ivs19-12ga4.52.102.062.56363l1764p1.56.09.892.54350v1688del']\n"
     ]
    }
   ],
   "source": [
    "a = list(ft_words)\n",
    "a.sort(key=len, reverse=True)\n",
    "print(a[:10])\n",
    "del a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:54:08.570044Z",
     "start_time": "2017-09-26T05:54:08.479008Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(362933, 362933)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ft_wordidx = {w:i for i,w in enumerate(ft_words)}\n",
    "ft_vocab_size, len(ft_wordidx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:54:08.695038Z",
     "start_time": "2017-09-26T05:54:08.571474Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1677"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(vocab_words) - set(ft_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-26T05:54:08.847867Z",
     "start_time": "2017-09-26T05:54:08.696286Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'lrp4',\n",
       " 'dfnb59',\n",
       " 'a75p',\n",
       " 'g1803a',\n",
       " 'e1682v',\n",
       " 'atp7b',\n",
       " 'fxn',\n",
       " 'r561c',\n",
       " 't417_d419delinsi',\n",
       " 'mocs1',\n",
       " 'r1726g',\n",
       " 'scn4a',\n",
       " 'pdha1',\n",
       " 'c420g',\n",
       " 'g2420c',\n",
       " 'prkn',\n",
       " 'dpys',\n",
       " 'e1051k',\n",
       " 'v384d',\n",
       " 'prpf31',\n",
       " 'plod2',\n",
       " 'h1805p',\n",
       " 's860l',\n",
       " 'g776delinslc',\n",
       " 's241y',\n",
       " 'washc5',\n",
       " 'r112g',\n",
       " 'siae',\n",
       " 'r342w',\n",
       " 'q2416*',\n",
       " 'ctns',\n",
       " 'rtn4r',\n",
       " 'e685v',\n",
       " 'q1811r',\n",
       " 'a41s',\n",
       " 'a500t',\n",
       " 'm1663l',\n",
       " 'gdi1',\n",
       " 'gnmt',\n",
       " 'd603g',\n",
       " 'adgrg1',\n",
       " 'mtmr14',\n",
       " 'slc12a3',\n",
       " 'r421*',\n",
       " 'l37p',\n",
       " 'e1356g',\n",
       " 'h773dup',\n",
       " 'g598a',\n",
       " 's216f',\n",
       " 'p2417a',\n",
       " 'cfl2',\n",
       " 'y599_d600inspapqimststlisenmnia',\n",
       " 'x582_splice',\n",
       " 'cngb3',\n",
       " 'bbs4',\n",
       " 'vangl2',\n",
       " 'r505l',\n",
       " 'bckdhb',\n",
       " 'd737v',\n",
       " 'i2675v',\n",
       " 'arsb',\n",
       " 'r331p',\n",
       " 'i1718t',\n",
       " 'a2351g',\n",
       " 'l1844r',\n",
       " 'r2336p',\n",
       " 't244_i245inscpt',\n",
       " 'r280g',\n",
       " 'a72d',\n",
       " 'grxcr1',\n",
       " 'cldn14',\n",
       " 'aptx',\n",
       " 'phex',\n",
       " 'slc19a3',\n",
       " 'r100*',\n",
       " 'hps3',\n",
       " 'gjb1',\n",
       " 's453fs*',\n",
       " 'krt86',\n",
       " 'atg16l1',\n",
       " 'pafah1b1',\n",
       " 'a60v',\n",
       " 't1685a',\n",
       " 'm1663k',\n",
       " 'ftcd',\n",
       " 'hip1-pdgfrb',\n",
       " 'l747_a750del',\n",
       " 'n480del',\n",
       " 'a57v',\n",
       " 'dock8',\n",
       " 'e554_k558del',\n",
       " 'pitx3',\n",
       " 'm1v',\n",
       " 'q546l',\n",
       " 'p838l',\n",
       " 'r1627',\n",
       " 'a312p',\n",
       " 'dysf',\n",
       " 'k376n',\n",
       " 'dmgdh',\n",
       " 'f537_k539delinsl',\n",
       " 's23r',\n",
       " 'gja8',\n",
       " 'sh3pxd2b',\n",
       " 'sil1',\n",
       " 'p551_m552del',\n",
       " 't216s',\n",
       " 'stxbp1',\n",
       " 'hspb3',\n",
       " 'alg3',\n",
       " 'gdap1',\n",
       " 't1151r',\n",
       " 'sucla2',\n",
       " 'e606g',\n",
       " 'sptlc1',\n",
       " 'k700r',\n",
       " 'e69g',\n",
       " 'v750e',\n",
       " 'slc25a3',\n",
       " 'g87r',\n",
       " 'p2ry12',\n",
       " 'cbx2',\n",
       " 'y113*',\n",
       " 'g196v',\n",
       " 'g106_r108del',\n",
       " 'stox1',\n",
       " 'r1190w',\n",
       " 'crb1',\n",
       " 'd1352y',\n",
       " 'r514c',\n",
       " 'y382h',\n",
       " 'cpt1a',\n",
       " 'b3glct',\n",
       " 'opa1',\n",
       " 'k550_w557del',\n",
       " 'slc26a2',\n",
       " 'lpar6',\n",
       " 'mkrn1-braf',\n",
       " 'cyba',\n",
       " 'colq',\n",
       " 'r369w',\n",
       " 'upk3a',\n",
       " 'l283_d294del',\n",
       " 'y647c',\n",
       " 'k550_v555delinsi',\n",
       " 'e545v',\n",
       " 'kcnj10',\n",
       " 'k806a',\n",
       " 'v118d',\n",
       " 'krt83',\n",
       " 'slc6a5',\n",
       " 'ubiad1',\n",
       " 'alg12',\n",
       " 'slc37a4',\n",
       " 'hpgd',\n",
       " 'w531c',\n",
       " 'q276*',\n",
       " 'e120q',\n",
       " 'x475_splice',\n",
       " 's1088f',\n",
       " 'chst14',\n",
       " 'zfyve27',\n",
       " 'r194w',\n",
       " 'krt12',\n",
       " '533_534del',\n",
       " 'hmcn1',\n",
       " 'p286s',\n",
       " 'nipa1',\n",
       " 'slc22a4',\n",
       " 'cyp4v2',\n",
       " 'r621c',\n",
       " 'v3079i',\n",
       " 'l611_e612inscssdneyfyvdfreyeydlkwefprenl',\n",
       " 'delta-ntrk1',\n",
       " 'c450_k451insmiewmi',\n",
       " 'n639k',\n",
       " 'r450h',\n",
       " 'p490_q494del',\n",
       " 'agps',\n",
       " 'y24c',\n",
       " 'robo3',\n",
       " 'adgrv1',\n",
       " 'gm2a',\n",
       " 'pomgnt1',\n",
       " 'stra6',\n",
       " 'y3092c',\n",
       " 'q144p',\n",
       " 'r659l',\n",
       " 'g6pc',\n",
       " 'd1010n',\n",
       " 'd568n',\n",
       " 'tspan12',\n",
       " 'i111a',\n",
       " 'r2842c',\n",
       " 'a1701p',\n",
       " 'trpm7',\n",
       " 'i254t',\n",
       " 'r544s',\n",
       " 'd3170g',\n",
       " 'd402y',\n",
       " 'trex1',\n",
       " 'mbtps2',\n",
       " 'v762m',\n",
       " 's241a',\n",
       " 'aldh5a1',\n",
       " 'kcne3',\n",
       " 'cc2d2a',\n",
       " 'galns',\n",
       " 'wdr35',\n",
       " 'fut1',\n",
       " 'a767_v769del',\n",
       " 'y35c',\n",
       " 'hlcs',\n",
       " 's326r',\n",
       " 'v665a',\n",
       " 'c1qtnf5',\n",
       " 'f877l',\n",
       " 'nsdhl',\n",
       " 'matn3',\n",
       " 'e218*',\n",
       " 'a2770t',\n",
       " 'd587h',\n",
       " 'a603fs',\n",
       " 'pla2g6',\n",
       " 'v1306i',\n",
       " 'anos1',\n",
       " 'fam126a',\n",
       " 'a767_v769dup',\n",
       " 'mmaa',\n",
       " 'a41t',\n",
       " 'q429*',\n",
       " 's562l',\n",
       " 'gckr',\n",
       " 'tlr5',\n",
       " 'nhlrc1',\n",
       " 'gdf5',\n",
       " 'v559_v560del',\n",
       " 'nectin4',\n",
       " 'e265k',\n",
       " 'r380a',\n",
       " 'p29t',\n",
       " 'k483e',\n",
       " 'acsl4',\n",
       " 'a120s',\n",
       " 'l455m',\n",
       " 'g119t',\n",
       " 'slc35d1',\n",
       " 'c1365y',\n",
       " 'htra2',\n",
       " 'k459_s460delinsn',\n",
       " 'm1328i',\n",
       " 'd1010y',\n",
       " 'r515a',\n",
       " 'sf3b2',\n",
       " 'kcnv2',\n",
       " 'scn1b',\n",
       " 'cx3cr1',\n",
       " 'pou3f4',\n",
       " 'creld1',\n",
       " 'cpt2',\n",
       " 'i408v',\n",
       " 'p449_l455del',\n",
       " 'tsen54',\n",
       " 'r957q',\n",
       " 'k745_a750del',\n",
       " 'gcnt2',\n",
       " 'l97r',\n",
       " 'lcat',\n",
       " 'e60l',\n",
       " 'slc6a8',\n",
       " 'q72l',\n",
       " 'r1446g',\n",
       " 't2250a',\n",
       " 'slc6a19',\n",
       " 't599_v600insett',\n",
       " 'r479h',\n",
       " 'g173c',\n",
       " 'n2829r',\n",
       " 'slc52a3',\n",
       " 'l755m',\n",
       " 'tcn2',\n",
       " 'tbc1d24',\n",
       " 'ift122',\n",
       " 's752_i759del',\n",
       " 'cyp2r1',\n",
       " 'i251s',\n",
       " 'd887n',\n",
       " 'w559_r560del',\n",
       " 'a77p',\n",
       " 'f590g',\n",
       " 'asah1',\n",
       " 'avp',\n",
       " 'antxr1',\n",
       " 'd609e',\n",
       " 'hsf4',\n",
       " 'y1045*',\n",
       " 'e1836k',\n",
       " 'arl6',\n",
       " 'r798*',\n",
       " 'd769n',\n",
       " 'r148i',\n",
       " 'q395*',\n",
       " 'rps26',\n",
       " 'e709_t710delinsd',\n",
       " 'mapk8ip1',\n",
       " 'k467t',\n",
       " 'd646y',\n",
       " 'cstb',\n",
       " 'lpin2',\n",
       " 'x1009_splice',\n",
       " 'hesx1',\n",
       " 'r258m',\n",
       " 't3211k',\n",
       " 'kcne2',\n",
       " 'd74y',\n",
       " 'r922*',\n",
       " 'n771_h773dup',\n",
       " 'rfxank',\n",
       " 's45c',\n",
       " 'alg6',\n",
       " 'fgf14',\n",
       " 'v2969m',\n",
       " 'a598t',\n",
       " 'l301s',\n",
       " 'acat1',\n",
       " 'n826y',\n",
       " 's206c',\n",
       " 'tbx22',\n",
       " 'v769_d770insgvv',\n",
       " 'g665a',\n",
       " 'q2384k',\n",
       " 'e622q',\n",
       " 'coq8a',\n",
       " 'n1333gfs*',\n",
       " 'cul1-braf',\n",
       " 'a97g',\n",
       " 'lgi1',\n",
       " 'eif2b1',\n",
       " 'd1642h',\n",
       " 's267_d273dup',\n",
       " 'opcml',\n",
       " 'ccm2',\n",
       " 'd29h',\n",
       " 'i33del',\n",
       " 'dars2',\n",
       " 'y236s',\n",
       " 's2483n',\n",
       " 'r2842h',\n",
       " 'kcnmb1',\n",
       " 'scnn1b',\n",
       " 'e1978*',\n",
       " 'bsnd',\n",
       " 'g596c',\n",
       " 'sumf1',\n",
       " 'y1853*',\n",
       " 'r389*',\n",
       " 'g17e',\n",
       " 'n387p',\n",
       " 'g1194d',\n",
       " 't82a',\n",
       " 'e612_f613insgyvdfreyeydlkwefrprenlef',\n",
       " 'ndufs2',\n",
       " 'aldh4a1',\n",
       " 'w24s',\n",
       " 's116f',\n",
       " 'cox4i2',\n",
       " 'v104l',\n",
       " 'i1680n',\n",
       " 'l461v',\n",
       " 'd289del',\n",
       " 'i1807s',\n",
       " 'ndufaf4',\n",
       " 'p81t',\n",
       " 'e267g',\n",
       " 'a1200v',\n",
       " 'd29y',\n",
       " 'v852i',\n",
       " 'f158c',\n",
       " 'foxred1',\n",
       " 'fcgr2b',\n",
       " 'aldh3a2',\n",
       " 'r2502h',\n",
       " 'a77s',\n",
       " 'plod3',\n",
       " 'l234fs',\n",
       " 'e598_y599insdvdfreye',\n",
       " 'f136l',\n",
       " 'y1003*',\n",
       " 'atp6v0a4',\n",
       " 'y375_k455del',\n",
       " 'l485_q494del',\n",
       " 'gnptg',\n",
       " 'p1859r',\n",
       " 'plec',\n",
       " 'e144k',\n",
       " 'l1678p',\n",
       " 'trmu',\n",
       " 'adamts13',\n",
       " 'd842_i843delinsim',\n",
       " 'l1273f',\n",
       " 'r838q',\n",
       " 'tfr2',\n",
       " 'mplkip',\n",
       " 'mfn2',\n",
       " 'bbs9',\n",
       " '1_2009trunc',\n",
       " 'gnptab',\n",
       " 's142i',\n",
       " 'y553_k558del',\n",
       " 'd2312v',\n",
       " 'd806h',\n",
       " 'mcfd2',\n",
       " 'g1079d',\n",
       " 'pccb',\n",
       " 'nipal4',\n",
       " 'a1752v',\n",
       " 'e326l',\n",
       " 'alg1',\n",
       " 'r248g',\n",
       " 'f216a',\n",
       " 'pon2',\n",
       " 'efhc1',\n",
       " 'hadha',\n",
       " 'n987i',\n",
       " 'dnm1l',\n",
       " 'snta1',\n",
       " 'olr1',\n",
       " 'atf7ip-pdgfrb',\n",
       " 'r2108h',\n",
       " 'idua',\n",
       " 't2681r',\n",
       " 'gbe1',\n",
       " 'r108g',\n",
       " 'r133*',\n",
       " 'slc11a2',\n",
       " 'k558delinsnp',\n",
       " 'slc22a18',\n",
       " 'w603_e604insdreyeydlkw',\n",
       " 'n1819s',\n",
       " 'arsa',\n",
       " 'r213l',\n",
       " 'slc4a11',\n",
       " 'l915m',\n",
       " 'pnpo',\n",
       " 'bag3',\n",
       " 'q58_q59insl',\n",
       " 'v35m',\n",
       " 'd32v',\n",
       " 'fig4',\n",
       " 'hsd3b7',\n",
       " 'atp13a2',\n",
       " 'e746_s752delinsi',\n",
       " 'rxfp2',\n",
       " 'umps',\n",
       " 'h284y',\n",
       " 'k45t',\n",
       " 'neu1',\n",
       " 'v1188l',\n",
       " 'c248t',\n",
       " 'p4309a',\n",
       " 'y234n',\n",
       " 't488_p492del',\n",
       " 'e632_l633del',\n",
       " 'p1776s',\n",
       " 'e571a',\n",
       " 's279y',\n",
       " 'l49h',\n",
       " '2010_2471trunc',\n",
       " 'cox15',\n",
       " 'apol1',\n",
       " 's584l',\n",
       " 't4511i',\n",
       " 'ecm1',\n",
       " 'e160*',\n",
       " 'hmgcl',\n",
       " 'arfgef2',\n",
       " '385_418del',\n",
       " 'vps33b',\n",
       " 'r812a',\n",
       " 'v564i',\n",
       " 'c383y',\n",
       " 'r462i',\n",
       " 'y599_d600insglyvdfreyey',\n",
       " 'clcnka',\n",
       " 't286i',\n",
       " 'n553s',\n",
       " 'd1778g',\n",
       " 'xylt2',\n",
       " 'tufm',\n",
       " 'cpn1',\n",
       " 'abcg8',\n",
       " 'r1209w',\n",
       " 'm391r',\n",
       " 'i99m',\n",
       " 'papss2',\n",
       " 'p375s',\n",
       " 'v677i',\n",
       " 'eif2b3',\n",
       " 'q984k',\n",
       " 'rnaset2',\n",
       " 'acads',\n",
       " 'dpagt1',\n",
       " '596_619splice',\n",
       " 'litaf',\n",
       " 't319del',\n",
       " 'brwd3',\n",
       " 'kcnj5',\n",
       " 's1670a',\n",
       " 'e746_t751insip',\n",
       " 'rbm20',\n",
       " 's102f',\n",
       " 'q579_l581del',\n",
       " 'y220d',\n",
       " 'l2721h',\n",
       " 'r258c',\n",
       " 'kcnj18',\n",
       " 'gamt',\n",
       " 'a146p',\n",
       " 'r80p',\n",
       " 'd390y',\n",
       " 'cpox',\n",
       " 'y3098h',\n",
       " 'atf7ip-jak2',\n",
       " 'w802*',\n",
       " 'e719k',\n",
       " 'yars',\n",
       " 'n238s',\n",
       " 'v294m',\n",
       " 'pcca',\n",
       " 'p551_w557delinsl',\n",
       " 'pepd',\n",
       " 'f568fs',\n",
       " 'n659r',\n",
       " 'e35*',\n",
       " 'gucy2d',\n",
       " 'l576del',\n",
       " 'phka1',\n",
       " 'r183p',\n",
       " 'sar1b',\n",
       " 'e746_s752delinsv',\n",
       " 'h355m',\n",
       " 'w1502a',\n",
       " 'ppargc1b',\n",
       " 'e311_k312del',\n",
       " 'gch1',\n",
       " 'l145r',\n",
       " 'm552_k558del',\n",
       " 's1841r',\n",
       " 'h1918y',\n",
       " 'q2405rfs*17',\n",
       " 't80r',\n",
       " 'd1851e',\n",
       " 'n549s',\n",
       " 'secisbp2',\n",
       " 'rspo4',\n",
       " 'd355e',\n",
       " 'mpdu1',\n",
       " 'cdan1',\n",
       " 'dtna',\n",
       " 'n1125i',\n",
       " 'a113_splice',\n",
       " 'slc25a12',\n",
       " 'e1322*',\n",
       " 'd842_h845del',\n",
       " 'crx',\n",
       " 'r5q',\n",
       " 'sc5d',\n",
       " 's302g',\n",
       " 't599_v600inseat',\n",
       " 'r592h',\n",
       " 'mlph',\n",
       " 'mefv',\n",
       " 'f53i',\n",
       " 'q123r',\n",
       " 'h2428q',\n",
       " 'sgce',\n",
       " 'ndufaf5',\n",
       " 'y598c',\n",
       " 'm1i',\n",
       " 'd816a',\n",
       " 'g81d',\n",
       " 'a128d',\n",
       " 'l122r',\n",
       " 'a197t',\n",
       " 'n564_y578del',\n",
       " 'myo5b',\n",
       " 'znf592',\n",
       " 'prkag3',\n",
       " 'm1_e165del',\n",
       " 'bbs1',\n",
       " 'bcs1l',\n",
       " 'q1064r',\n",
       " 'k1299e',\n",
       " 't574_r588delinsl',\n",
       " 'l1301r',\n",
       " 'h773inslgnp',\n",
       " 'p1502l',\n",
       " 'abcd1',\n",
       " 's109p',\n",
       " 'f1662s',\n",
       " 'slurp1',\n",
       " 'r487*',\n",
       " 'r1608s',\n",
       " 'serpina7',\n",
       " 's1424c',\n",
       " 'spink5',\n",
       " 'igh-fgfr3',\n",
       " 's1722f',\n",
       " 'htra1',\n",
       " 'sptbn1-pdgfrb',\n",
       " 'etfdh',\n",
       " 'p38l',\n",
       " 'g914r',\n",
       " 'ndufaf3',\n",
       " 'd842_m844del',\n",
       " 'w714*',\n",
       " 'slc6a20',\n",
       " 'prkra',\n",
       " 'y568_l576delinsvn',\n",
       " 'a2425t',\n",
       " 'rp2',\n",
       " 'h662r',\n",
       " 'q201h',\n",
       " 'p28s',\n",
       " 'r1189*',\n",
       " 'fgfr1op1-fgfr1',\n",
       " 'pitx1',\n",
       " 'l1574p',\n",
       " 'v1676d',\n",
       " 'rdh12',\n",
       " 'r2888c',\n",
       " 's36y',\n",
       " 'tubb1',\n",
       " 'slc7a7',\n",
       " 'y646s',\n",
       " 'slc17a5',\n",
       " 'tmie',\n",
       " 'n581t',\n",
       " 'foxi1',\n",
       " 'k28m',\n",
       " 'e14*',\n",
       " 'arms2',\n",
       " 'ece1',\n",
       " 'gdf1',\n",
       " 'akap10',\n",
       " '-',\n",
       " 'w742l',\n",
       " 'tirap',\n",
       " 'g292r',\n",
       " 'r335*',\n",
       " 'acadl',\n",
       " 'v747l',\n",
       " 'r79p',\n",
       " 'pla2g4a',\n",
       " 's280f',\n",
       " 'klf11',\n",
       " 'lhx3',\n",
       " 'd301n',\n",
       " 'rgs9',\n",
       " 's257w',\n",
       " 'r24p',\n",
       " 'k830r',\n",
       " 'r1563s',\n",
       " 's24f',\n",
       " 'v197l',\n",
       " 'khk',\n",
       " 'a97v',\n",
       " 'pank2',\n",
       " 'i32del',\n",
       " 'n2436i',\n",
       " 'flnb',\n",
       " 'slc25a22',\n",
       " 'i49s',\n",
       " 'r2502c',\n",
       " 'a1685s',\n",
       " 'r226*',\n",
       " 'd1692h',\n",
       " 'lipa',\n",
       " 'ndufs8',\n",
       " 'chst3',\n",
       " 'd493a',\n",
       " 'r640g',\n",
       " 'r680*',\n",
       " 'g253c',\n",
       " 'g101s',\n",
       " 'atp8b1',\n",
       " 'g1286r',\n",
       " 'fbn2',\n",
       " 'kiss1r',\n",
       " 'slc11a1',\n",
       " 'slc25a19',\n",
       " 'kiaa1509-pdgfrb',\n",
       " 'x1008_splice',\n",
       " 'tulp1',\n",
       " 's123t',\n",
       " 'e106g',\n",
       " 'y406h',\n",
       " 'r174*',\n",
       " 'smpd1',\n",
       " 'a1830t',\n",
       " 'zdhhc9',\n",
       " 'plekhg5',\n",
       " 'kirrel3',\n",
       " 'manba',\n",
       " 'zfp57',\n",
       " 'prokr2',\n",
       " 'xylt1',\n",
       " 'f594_r595inssdneyfyvdf',\n",
       " 's1303n',\n",
       " 'c1385',\n",
       " '422_605trunc',\n",
       " 'g478c',\n",
       " 'r120m',\n",
       " 'r583a',\n",
       " 's786f',\n",
       " 'r174c',\n",
       " 't1852s',\n",
       " 'ahi1',\n",
       " 'znf81',\n",
       " 'rapsn',\n",
       " 's768_d770dup',\n",
       " 'g386s',\n",
       " 'p648s',\n",
       " 'ndufs7',\n",
       " 'y646n',\n",
       " 'pvt1-myc',\n",
       " 'r2318q',\n",
       " 'i491m',\n",
       " 'v561_i562inser',\n",
       " 'f468c',\n",
       " 'aloxe3',\n",
       " 'n551t',\n",
       " 'n1228d',\n",
       " 'r2327w',\n",
       " 'r1751p',\n",
       " 'ifnar2',\n",
       " 'l1854p',\n",
       " 'q25h',\n",
       " 'g67s',\n",
       " 'lrat',\n",
       " 'q110r',\n",
       " 'pdss1',\n",
       " 't73p',\n",
       " 'ube3a',\n",
       " 'bscl2',\n",
       " 'd837n',\n",
       " 'p286h',\n",
       " 'r162*',\n",
       " 'slc25a20',\n",
       " 'gpc3',\n",
       " 'w345*',\n",
       " 'sh3tc2',\n",
       " 'tmprss2-etv4',\n",
       " 'n588d',\n",
       " 'k765r',\n",
       " 'g81s',\n",
       " 'scn4b',\n",
       " 'l146r',\n",
       " 'e746_t751delinsl',\n",
       " 'bckdha',\n",
       " 'six5',\n",
       " 'n2113s',\n",
       " 't1365m',\n",
       " 'gjb3',\n",
       " 'y1003c',\n",
       " 'l221r',\n",
       " 'pygl',\n",
       " 'p531s',\n",
       " 'cnnm4',\n",
       " 'i843del',\n",
       " 'd2512g',\n",
       " 'lrp8',\n",
       " 'l2396f',\n",
       " 'r11k',\n",
       " 'slc29a3',\n",
       " 'l861f',\n",
       " 'fktn',\n",
       " 'd739y',\n",
       " 'p660t',\n",
       " 'm160v',\n",
       " 'q1756fs',\n",
       " 'flvcr2',\n",
       " 'l585i',\n",
       " 'hsd17b3',\n",
       " 'htr2c',\n",
       " 'mccc2',\n",
       " 'ndufv2',\n",
       " 'gyg1',\n",
       " 'h115q',\n",
       " 'c41y',\n",
       " 'v544_l545insavlvllviviisli',\n",
       " 'dlat',\n",
       " 'iyd',\n",
       " 'p1771r',\n",
       " 'd2512y',\n",
       " 's214t',\n",
       " 'fam20c',\n",
       " 'sumo4',\n",
       " 'm552_w557del',\n",
       " 'f154l',\n",
       " 'ubr1',\n",
       " 'rpgr',\n",
       " 'abcg5',\n",
       " 'f1088lfs*5',\n",
       " 'atg7-raf1',\n",
       " 'k2472t',\n",
       " 'q816*',\n",
       " 'man2b1',\n",
       " 'r370c',\n",
       " 'upb1',\n",
       " 'g81r',\n",
       " 'r361p',\n",
       " 'epm2a',\n",
       " 'e746q',\n",
       " 'ep300-moz',\n",
       " 'tbx1',\n",
       " 'p2415del',\n",
       " 'scnn1a',\n",
       " 'n463s',\n",
       " 'i1018w',\n",
       " 'igl-myc',\n",
       " 'st14',\n",
       " 'n486_p490del',\n",
       " 't844m',\n",
       " 'e746_s752delinsa',\n",
       " 'efemp1',\n",
       " 'mlycd',\n",
       " 't1354m',\n",
       " 'g23d',\n",
       " 'i1929v',\n",
       " 'r1758g',\n",
       " 'slc1a3',\n",
       " 'l559r',\n",
       " 'nt5c3a',\n",
       " 'pomt1',\n",
       " 'slc27a4',\n",
       " 'rgr',\n",
       " 'p1856t',\n",
       " 's387n',\n",
       " 'a36p',\n",
       " 'clcf1',\n",
       " 'v487_p492delinsa',\n",
       " 'trappc2',\n",
       " 'e554_i571del',\n",
       " 'd1344h',\n",
       " 'rai1',\n",
       " 'l1584r',\n",
       " 'r2304c',\n",
       " 'rpgrip1',\n",
       " 'n71k',\n",
       " 'clcn5',\n",
       " 'acadsb',\n",
       " 'p1812s',\n",
       " 'l747_p753delinss',\n",
       " 'efnb1',\n",
       " 'mesp2',\n",
       " 'd1778h',\n",
       " 'trpv4',\n",
       " 'q689r',\n",
       " 'w290_i291delinsc',\n",
       " 'n71i',\n",
       " 'dpyd',\n",
       " 'a888v',\n",
       " 'e275k',\n",
       " 'slc22a12',\n",
       " 't28i',\n",
       " 'h133q',\n",
       " 'nubpl',\n",
       " '256_286trunc',\n",
       " 'd423n',\n",
       " 'fgd1',\n",
       " 'l485_p490del',\n",
       " 'dnm2',\n",
       " 'dsg4',\n",
       " 'v1576e',\n",
       " 'a60r',\n",
       " 'cyp4f22',\n",
       " 'uroc1',\n",
       " 'p95s',\n",
       " 'abca12',\n",
       " 'fah',\n",
       " 'dpm3',\n",
       " 'h845_n848delinsp',\n",
       " 'p531l',\n",
       " 'faah',\n",
       " 'opa3',\n",
       " 'h1094l',\n",
       " 'l485_p490delinsf',\n",
       " 'q1396r',\n",
       " 'g2430a',\n",
       " 'slc39a4',\n",
       " 'sema3e',\n",
       " 'd384n',\n",
       " 'znf365',\n",
       " 'pex7',\n",
       " 'r2430m',\n",
       " 's501_a502dup',\n",
       " 'acadvl',\n",
       " 'e749q',\n",
       " 'guca1a',\n",
       " 'egfr-purb',\n",
       " 'c39s',\n",
       " 'btd',\n",
       " 'v220f',\n",
       " 'pklr',\n",
       " 'v60m',\n",
       " 'chmp4b',\n",
       " 'r282p',\n",
       " 'd1071n',\n",
       " 'cacna1f',\n",
       " 's241t',\n",
       " 'syt6',\n",
       " 'k666m',\n",
       " 'mat1a',\n",
       " '?',\n",
       " 'pou6f2',\n",
       " 'r304*',\n",
       " 't75m',\n",
       " 'kcnq2',\n",
       " 'best1',\n",
       " 's70fsx93',\n",
       " '981_1028splice',\n",
       " 'npc1',\n",
       " 'l301f',\n",
       " 'k641n',\n",
       " 'n1100y',\n",
       " 'tjp2',\n",
       " 'cartpt',\n",
       " 'aipl1',\n",
       " 'g67w',\n",
       " 'cilp',\n",
       " 'clcnkb',\n",
       " 's1164i',\n",
       " 'r631c',\n",
       " 't1025a',\n",
       " 'g52r',\n",
       " 'f384y',\n",
       " 'znf41',\n",
       " 'l617m',\n",
       " 'e946*',\n",
       " 'tnfrsf13b',\n",
       " 'r2418g',\n",
       " 'i668v',\n",
       " 'cd40lg',\n",
       " 'lhx4',\n",
       " 'ccnd1-igh',\n",
       " 'vps13b',\n",
       " 'l607i',\n",
       " 'ndp',\n",
       " '<SOSent>',\n",
       " 'p551_e554del',\n",
       " 't582p',\n",
       " 'r711*',\n",
       " 'r698w',\n",
       " 'mfrp',\n",
       " 'i111r',\n",
       " 'emg1',\n",
       " 'y599_d600inseyeyeyey',\n",
       " 'd357y',\n",
       " 'rp1l1',\n",
       " 'm535i',\n",
       " 'y1414c',\n",
       " 'p291qfs*51',\n",
       " 'k575m',\n",
       " 'tnfrsf11b',\n",
       " 'dnaaf1',\n",
       " 'bbs2',\n",
       " 'd29n',\n",
       " 'i563_l576del',\n",
       " 'alg9',\n",
       " 'v555_l576del',\n",
       " 'rp1',\n",
       " 'kcnj2',\n",
       " 'c1483w',\n",
       " 'kcnma1',\n",
       " 'g583e',\n",
       " 'c554w',\n",
       " 'd600_l601insfreyeyd',\n",
       " 'v600d_k601insfglat',\n",
       " 'l747_t751delinsp',\n",
       " 's746fs',\n",
       " 'pex5',\n",
       " 't574instqlpyd',\n",
       " 'unc13d',\n",
       " 'l493v',\n",
       " 'abca4',\n",
       " 'i18v',\n",
       " 'r886w',\n",
       " 'h284p',\n",
       " 'n542_e543del',\n",
       " 'm168t',\n",
       " 'slc10a2',\n",
       " 'g93w',\n",
       " 'dync2h1',\n",
       " 'x434_splice',\n",
       " 'guca1b',\n",
       " 'tspan7',\n",
       " 'bbs12',\n",
       " 'agrp',\n",
       " 'prpf3',\n",
       " ...}"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Words present in vocab_words that do not occur in ft_words.\n",
    "set(vocab_words).difference(ft_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T19:41:30.043920Z",
     "start_time": "2017-09-24T19:41:29.856686Z"
    }
   },
   "outputs": [
    {
     "ename": "AssertionError",
     "evalue": "fast text some vectors doesn't match dimensions200 != 20",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-80-c83bc136c868>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mglobal_utils\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mfasttext_vec_file\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mwvs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mglobal_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_corpus_wvs_from_ft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfasttext_vec_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m200\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab_words\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mwvs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Projects/dsotc/lib/global_utils.py\u001b[0m in \u001b[0;36mget_corpus_wvs_from_ft\u001b[0;34m(fasttext_vec_file, dim, vocab_words)\u001b[0m\n\u001b[1;32m     43\u001b[0m       \u001b[0mword\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     44\u001b[0m       \u001b[0mvec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstr_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m       \u001b[0;32massert\u001b[0m \u001b[0mdim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fast text some vectors doesn't match dimensions\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" != \"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     46\u001b[0m       \u001b[0mft_wvs_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvec\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     47\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAssertionError\u001b[0m: fast text some vectors doesn't match dimensions200 != 20"
     ]
    }
   ],
   "source": [
    "%autoreload\n",
    "import global_utils\n",
    "\n",
    "# fastText text dump to read; the file name suggests CBOW, 200 dimensions, 20 epochs.\n",
    "fasttext_vec_file = \"/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec\"\n",
    "\n",
    "# The helper asserts that every vector parsed from the file has exactly `dim` entries.\n",
    "wvs = global_utils.get_corpus_wvs_from_ft(\n",
    "    fasttext_vec_file=fasttext_vec_file,\n",
    "    dim=200,\n",
    "    vocab_words=vocab_words,\n",
    ")\n",
    "wvs.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### saving all trained fast text vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T19:56:10.850811Z",
     "start_time": "2017-09-24T19:56:10.685508Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 550348\r\n",
      "-rwxrwxr-x 1 bicepjai 563552080 Sep 24 11:30 \u001b[0m\u001b[01;32mbiolab_updated_wvs.npy\u001b[0m*\r\n"
     ]
    }
   ],
   "source": [
    "%ll /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T19:59:09.795527Z",
     "start_time": "2017-09-24T19:59:09.788664Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "352220"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-24T20:00:33.263769Z",
     "start_time": "2017-09-24T20:00:33.179714Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_20e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_20e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_20e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_20e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_20e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_50e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_50e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_50e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_50e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_50e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_100e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_100e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_100e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_100e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_100e.vec\n",
      "file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_100e.vec\n"
     ]
    }
   ],
   "source": [
    "%autoreload\n",
    "import global_utils\n",
    "\n",
    "# Directory holding the fastText text dumps (<stem>.vec) that are converted below.\n",
    "ft_vec_dir = \"/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/\"\n",
    "\n",
    "# (dim, file stem) for every trained model, ordered by epochs, then model, then dim.\n",
    "# The dimension and the \"<dim>d\" part of the stem come from the same loop variable,\n",
    "# so they always agree; get_corpus_wvs_from_ft asserts that each vector has `dim` entries.\n",
    "ft_vector_files = [\n",
    "    (dim, \"ft_{}_{}d_{}e\".format(model, dim, epochs))\n",
    "    for epochs in (20, 50, 100)\n",
    "    for model in (\"cbow\", \"sg\")\n",
    "    for dim in (100, 200, 300)\n",
    "]\n",
    "\n",
    "for dim, file_name in ft_vector_files:\n",
    "    file_path = os.path.join(ft_vec_dir, file_name + \".vec\")\n",
    "    if not os.path.exists(file_path):\n",
    "        # Models that were not trained are skipped rather than treated as an error.\n",
    "        print(\"file doesnt exist\",file_path)\n",
    "        continue\n",
    "    ft_vec = global_utils.get_corpus_wvs_from_ft(file_path, dim, vocab_words)\n",
    "    # Report which file was converted and the shape of the resulting array.\n",
    "    print(file_path, ft_vec.shape)\n",
    "    # NOTE(review): the .npy is written relative to the current working directory,\n",
    "    # while the .vec is read from an absolute path - confirm both point at the same folder.\n",
    "    np.save(\"processed/stage1/pretrained_word_vectors/\"+file_name+\".npy\", ft_vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-09-22T23:53:19.466957Z",
     "start_time": "2017-09-22T23:53:19.463994Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(367260, 200)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Viewing word vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:22:17.637150Z",
     "start_time": "2017-11-06T08:22:17.604738Z"
    }
   },
   "outputs": [],
   "source": [
    "%autoreload\n",
    "import global_utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:23:26.168121Z",
     "start_time": "2017-11-06T08:23:02.868090Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(352220, 200)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Embedding width handed to the loader together with the .vec path below.\n",
    "WORD_EMB_SIZE = 200\n",
    "ft_file_path = \"/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec\"\n",
    "\n",
    "trained_embeddings = global_utils.get_embeddings_from_ft(\n",
    "    ft_file_path,\n",
    "    WORD_EMB_SIZE,\n",
    "    corpus_vocab_list,\n",
    ")\n",
    "trained_embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:25:27.031351Z",
     "start_time": "2017-11-06T08:25:27.026685Z"
    }
   },
   "outputs": [],
   "source": [
    "tb_vocab_size=5000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:25:27.713851Z",
     "start_time": "2017-11-06T08:25:27.455322Z"
    }
   },
   "outputs": [],
   "source": [
    "# Labels for the points exported to TensorBoard: the first tb_vocab_size vocabulary words.\n",
    "# NOTE(review): assumes row i of trained_embeddings belongs to corpus_vocab_list[i] -\n",
    "# confirm against global_utils.get_embeddings_from_ft.\n",
    "tb_vocab_biolab = list(corpus_vocab_list)[:tb_vocab_size]\n",
    "\n",
    "# The matching vectors, copied into a float64 array so later edits cannot touch trained_embeddings.\n",
    "tb_word_vectors = np.array(trained_embeddings[:tb_vocab_size], dtype=np.float64)\n",
    "\n",
    "# The metadata file must describe exactly the exported rows, one label per vector.\n",
    "assert len(tb_vocab_biolab) == tb_word_vectors.shape[0], \"metadata/vector row count mismatch\"\n",
    "\n",
    "# Write one label per line with no header row.\n",
    "os.makedirs(\"view_wvs_tb\", exist_ok=True)\n",
    "with open(\"view_wvs_tb/tb_vocab.tsv\", \"w\") as fp:\n",
    "    fp.writelines(str(word) + \"\\n\" for word in tb_vocab_biolab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-11-06T08:34:20.909670Z",
     "start_time": "2017-11-06T08:34:20.754171Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5000, 200)\n"
     ]
    }
   ],
   "source": [
    "%autoreload\n",
    "from utils import visualize_embeddings_in_tensorboard\n",
    "\n",
    "# Inputs for the export: the vectors to show and the path of their metadata (label) file.\n",
    "visualize_this_embedding = tb_word_vectors\n",
    "metadata_path = \"/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb/tb_vocab.tsv\"\n",
    "print(visualize_this_embedding.shape)\n",
    "\n",
    "# The third argument is the directory passed to the helper alongside the vectors and metadata path.\n",
    "visualize_embeddings_in_tensorboard(\n",
    "    visualize_this_embedding,\n",
    "    metadata_path,\n",
    "    \"/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {
    "height": "788px",
    "left": "0px",
    "right": "1249px",
    "top": "52px",
    "width": "329px"
   },
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
